{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T20:35:13Z","timestamp":1778877313997,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681392","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"6113-6122","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["EGGesture: Entropy-Guided Vector Quantized Variational AutoEncoder for Co-Speech Gesture Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-5645-7708","authenticated-orcid":false,"given":"Yiyong","family":"Xiao","sequence":"first","affiliation":[{"name":"Taiyuan Normal University, Taiyuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3302-6003","authenticated-orcid":false,"given":"Kai","family":"Shu","sequence":"additional","affiliation":[{"name":"University of Southern California, Los Angeles, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3693-786X","authenticated-orcid":false,"given":"Haoyi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong-Liverpool University, SuZhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9917-2776","authenticated-orcid":false,"given":"Baohua","family":"Yin","sequence":"additional","affiliation":[{"name":"Chongqing University of Science &amp; Technology, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3376-3544","authenticated-orcid":false,"given":"Wai Seng","family":"Cheang","sequence":"additional","affiliation":[{"name":"King's College London, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7526-3185","authenticated-orcid":false,"given":"Haoyang","family":"Wang","sequence":"additional","affiliation":[{"name":"St.Cloud State University, St.Cloud, MN, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0628-1416","authenticated-orcid":false,"given":"Jiechao","family":"Gao","sequence":"additional","affiliation":[{"name":"University of Virginia, Charlottesville, VA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Taras Kucherenko, and Jonas Beskow.","author":"Alexanderson Simon","year":"2020","unstructured":"Simon Alexanderson, Gustav Eje Henter, Taras Kucherenko, and Jonas Beskow. 2020. Style-Controllable Speech-Driven Gesture Synthesis Using Normalising Flows. In Computer Graphics Forum, Vol. 39. Wiley Online Library, 487--496."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555435"},{"key":"e_1_3_2_2_3_1","volume-title":"GestureDiffuCLIP: Gesture diffusion model with CLIP latents. arXiv preprint arXiv:2303.14613","author":"Ao Tenglong","year":"2023","unstructured":"Tenglong Ao, Zeyi Zhang, and Libin Liu. 2023. GestureDiffuCLIP: Gesture diffusion model with CLIP latents. arXiv preprint arXiv:2303.14613 (2023)."},{"key":"e_1_3_2_2_4_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33 (2020), 12449--12460."},{"key":"e_1_3_2_2_5_1","volume-title":"Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254","author":"Bao Hangbo","year":"2021","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2021. Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2018.07.011"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00949"},{"key":"e_1_3_2_2_8_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267851.3267898"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2020.04.007"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00361"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_34"},{"key":"e_1_3_2_2_13_1","volume-title":"Learning Speech-driven 3D Conversational Gestures from Video. arXiv preprint arXiv:2102.06837","author":"Habibie Ikhsanul","year":"2021","unstructured":"Ikhsanul Habibie, Weipeng Xu, Dushyant Mehta, Lingjie Liu, Hans-Peter Seidel, Gerard Pons-Moll, Mohamed Elgharib, and Christian Theobalt. 2021. Learning Speech-driven 3D Conversational Gestures from Video. arXiv preprint arXiv:2102.06837 (2021)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/11538059_91"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417836"},{"key":"e_1_3_2_2_17_1","volume-title":"FINEMATCH: Aspect-based Fine-grained Image and Text Mismatch Detection and Correction. arXiv preprint arXiv:2404.14715","author":"Hua Hang","year":"2024","unstructured":"Hang Hua, Jing Shi, Kushal Kafle, Simon Jenni, Daoan Zhang, John Collomosse, Scott Cohen, and Jiebo Luo. 2024. FINEMATCH: Aspect-based Fine-grained Image and Text Mismatch Detection and Correction. arXiv preprint arXiv:2404.14715 (2024)."},{"key":"e_1_3_2_2_18_1","volume-title":"V2xum-llm: Cross-modal video summarization with temporal prompt instruction tuning. arXiv preprint arXiv:2404.12353","author":"Hua Hang","year":"2024","unstructured":"Hang Hua, Yunlong Tang, Chenliang Xu, and Jiebo Luo. 2024. V2xum-llm: Cross-modal video summarization with temporal prompt instruction tuning. arXiv preprint arXiv:2404.12353 (2024)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.580"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00199"},{"key":"e_1_3_2_2_21_1","volume-title":"International conference on machine learning. pmlr, 448--456","author":"Ioffe Sergey","year":"2015","unstructured":"Sergey Ioffe and Christian Szegedy. 2015. Batch normalization: Accelerating deep network training by reducing internal covariate shift. In International conference on machine learning. pmlr, 448--456."},{"key":"e_1_3_2_2_22_1","volume-title":"Categorical reparameterization with gumbel-softmax. arXiv preprint arXiv:1611.01144","author":"Jang Eric","year":"2016","unstructured":"Eric Jang, Shixiang Gu, and Ben Poole. 2016. Categorical reparameterization with gumbel-softmax. arXiv preprint arXiv:1611.01144 (2016)."},{"key":"e_1_3_2_2_23_1","volume-title":"Decoupling Representation and Classifier for Long-Tailed Recognition. In International Conference on Learning Representations.","author":"Kang Bingyi","year":"2019","unstructured":"Bingyi Kang, Saining Xie, Marcus Rohrbach, Zhicheng Yan, Albert Gordo, Jiashi Feng, and Yannis Kalantidis. 2019. Decoupling Representation and Classifier for Long-Tailed Recognition. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"e_1_3_2_2_25_1","volume-title":"Feature manipulation for ddpm based change detection. arXiv preprint arXiv:2403.15943","author":"Li Zhenglin","year":"2024","unstructured":"Zhenglin Li, Yangchen Huang, Mengran Zhu, Jingyu Zhang, JingHao Chang, and Houze Liu. 2024. Feature manipulation for ddpm based change detection. arXiv preprint arXiv:2403.15943 (2024)."},{"key":"e_1_3_2_2_26_1","volume-title":"Stock market analysis and prediction using LSTM: A case study on technology stocks. Innovations in Applied Engineering and Technology","author":"Li Zhenglin","year":"2023","unstructured":"Zhenglin Li, Hanyi Yu, Jinxin Xu, Jihang Liu, and Yuhong Mo. 2023. Stock market analysis and prediction using LSTM: A case study on technology stocks. Innovations in Applied Engineering and Technology (2023), 1--6."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548400"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/HSI49210.2020.9142625"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN48605.2020.9207530"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00115"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_36"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2023.3318856"},{"key":"e_1_3_2_2_35_1","volume-title":"Co-Speech Gesture Synthesis using Discrete Gesture Token Learning. arXiv preprint arXiv:2303.12822","author":"Lu Shuhong","year":"2023","unstructured":"Shuhong Lu, Youngwoo Yoon, and Andrew Feng. 2023. Co-Speech Gesture Synthesis using Discrete Gesture Token Learning. arXiv preprint arXiv:2303.12822 (2023)."},{"key":"e_1_3_2_2_36_1","volume-title":"Proceedings of the2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. 4727--4741","author":"Lyu Weimin","year":"2022","unstructured":"Weimin Lyu, Songzhu Zheng, Tengfei Ma, and Chao Chen. 2022. A Study of the AttentionAbnormality in Trojaned BERTs. In Proceedings of the2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. 4727--4741."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.716"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01123"},{"key":"e_1_3_2_2_39_1","volume-title":"Aaron Van den Oord, and Oriol Vinyals","author":"Razavi Ali","year":"2019","unstructured":"Ali Razavi, Aaron Van den Oord, and Oriol Vinyals. 2019. Generating diverse high-fidelity images with vq-vae-2. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_2_40_1","volume-title":"Minhua Wu, Ariya Rastrow, Andreas Stolcke, Jasha Droppo, and Roland Maas.","author":"Sadhu Samik","year":"2021","unstructured":"Samik Sadhu, Di He, Che-Wei Huang, Sri Harish Mallidi, Minhua Wu, Ariya Rastrow, Andreas Stolcke, Jasha Droppo, and Roland Maas. 2021. Wav2vecc: A self-supervised model for speech representation learning. arXiv preprint arXiv:2103.08393 (2021)."},{"key":"e_1_3_2_2_41_1","volume-title":"Sq-vae: Variational bayes on discrete representation with self-annealed stochastic quantization. arXiv preprint arXiv:2205.07547","author":"Takida Yuhta","year":"2022","unstructured":"Yuhta Takida, Takashi Shibuya, WeiHsiang Liao, Chieh-Hsin Lai, Junki Ohmura, Toshimitsu Uesaka, Naoki Murata, Shusuke Takahashi, Toshiyuki Kumakura, and Yuki Mitsufuji. 2022. Sq-vae: Variational bayes on discrete representation with self-annealed stochastic quantization. arXiv preprint arXiv:2205.07547 (2022)."},{"key":"e_1_3_2_2_42_1","volume-title":"AVicuna: Audio-Visual LLM with Interleaver and Context-Boundary Alignment for Temporal Referential Dialogue. arXiv preprint arXiv:2403.16276","author":"Tang Yunlong","year":"2024","unstructured":"Yunlong Tang, Daiki Shimada, Jing Bi, and Chenliang Xu. 2024. AVicuna: Audio-Visual LLM with Interleaver and Context-Boundary Alignment for Temporal Referential Dialogue. arXiv preprint arXiv:2403.16276 (2024)."},{"key":"e_1_3_2_2_43_1","unstructured":"Aaron Van Den Oord Oriol Vinyals et al. 2017. Neural discrete representation learning. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096402"},{"key":"e_1_3_2_2_45_1","volume-title":"DiffuseStyleGesture: Stylized Audio-Driven Co-Speech Gesture Generation with Diffusion Models. arXiv preprint arXiv:2305.04919","author":"Yang Sicheng","year":"2023","unstructured":"Sicheng Yang, Zhiyong Wu, Minglei Li, Zhensong Zhang, Lei Hao, Weihong Bao, Ming Cheng, and Long Xiao. 2023. DiffuseStyleGesture: Stylized Audio-Driven Co-Speech Gesture Generation with Diffusion Models. arXiv preprint arXiv:2305.04919 (2023)."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00230"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793720"},{"key":"e_1_3_2_2_50_1","volume-title":"Han Zhang, Ruoming Pang, James Qin, Alexander Ku, Yuanzhong Xu, Jason Baldridge, and Yonghui Wu.","author":"Yu Jiahui","year":"2021","unstructured":"Jiahui Yu, Xin Li, Jing Yu Koh, Han Zhang, Ruoming Pang, James Qin, Alexander Ku, Yuanzhong Xu, Jason Baldridge, and Yonghui Wu. 2021. Vector-quantized image modeling with improved vqgan. arXiv preprint arXiv:2110.04627 (2021)."},{"key":"e_1_3_2_2_51_1","volume-title":"Prompt-Fix: You Prompt and We Fix the Photo. arXiv preprint arXiv:2405.16785","author":"Yu Yongsheng","year":"2024","unstructured":"Yongsheng Yu, Ziyun Zeng, Hang Hua, Jianlong Fu, and Jiebo Luo. 2024. Prompt-Fix: You Prompt and We Fix the Photo. arXiv preprint arXiv:2405.16785 (2024)."},{"key":"e_1_3_2_2_52_1","volume-title":"Research on the Application of Computer Vision Based on Deep Learning in Autonomous Driving Technology. arXiv preprint arXiv:2406.00490","author":"Zhang Jingyu","year":"2024","unstructured":"Jingyu Zhang, Jin Cao, Jinghao Chang, Xinjin Li, Houze Liu, and Zhenglin Li. 2024. Research on the Application of Computer Vision Based on Deep Learning in Autonomous Driving Technology. arXiv preprint arXiv:2406.00490 (2024)."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01016"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681392","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681392","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:44Z","timestamp":1750295864000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681392"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":53,"alternative-id":["10.1145\/3664647.3681392","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681392","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}