{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,24]],"date-time":"2025-10-24T15:09:59Z","timestamp":1761318599067,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","funder":[{"name":"Japan Society for the Promotion of Science","award":["JP24H00733"],"award-info":[{"award-number":["JP24H00733"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746268.3759433","type":"proceedings-article","created":{"date-parts":[[2025,10,24]],"date-time":"2025-10-24T15:00:16Z","timestamp":1761318016000},"page":"35-43","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SemGest: A Multimodal Feature Space Alignment and Fusion Framework for Semantic-aware Co-speech Gesture Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-2811-8764","authenticated-orcid":false,"given":"Yo-Hsin","family":"Fang","sequence":"first","affiliation":[{"name":"RIKEN, Kyoto, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9553-0906","authenticated-orcid":false,"given":"Vijay","family":"John","sequence":"additional","affiliation":[{"name":"RIKEN, Kyoto, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3799-4550","authenticated-orcid":false,"given":"Yasutomo","family":"Kawanishi","sequence":"additional","affiliation":[{"name":"RIKEN, Kyoto, Japan"}]}],"member":"320","published-online":{"date-parts":[[2025,10,26]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings, Part XVIII 16","author":"Ahuja Chaitanya","year":"2020","unstructured":"Chaitanya Ahuja, Dong Won Lee, Yukiko I Nakano, and Louis-Philippe Morency. 2020. Style transfer for co-speech gesture animation: A multi-speaker conditional mixture approach. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XVIII 16. Springer, 248--265."},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the 2021 International Conference on 3D Vision. IEEE, 565--574","author":"Aksan Emre","year":"2021","unstructured":"Emre Aksan, Manuel Kaufmann, Peng Cao, and Otmar Hilliges. 2021. A spatiotemporal transformer for 3D human motion prediction. In Proceedings of the 2021 International Conference on 3D Vision. IEEE, 565--574."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475223"},{"volume-title":"Computer Vision for Human--Machine Interaction: A Framework for Gesture Generation and Interpretation","author":"Cassell Justine","key":"e_1_3_2_1_4_1","unstructured":"Justine Cassell. 1998. Computer Vision for Human--Machine Interaction: A Framework for Gesture Generation and Interpretation. Cambridge university press. https:\/\/api.semanticscholar.org\/CorpusID:61011414"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/192161.192272"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/383259.383315"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680847"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00702"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00190"},{"key":"e_1_3_2_1_10_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ArXiv abs\/2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ArXiv abs\/2010.11929 (2020). https:\/\/api.semanticscholar.org\/CorpusID:225039882"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680625"},{"volume-title":"Computer Graphics Forum","author":"Ghorbani Saeed","key":"e_1_3_2_1_12_1","unstructured":"Saeed Ghorbani, Ylva Ferstl, Daniel Holden, Nikolaus F Troje, and Marc-Andr\u00e9 Carbonneau. 2023. ZeroEGGS: Zero-shot Example-based Gesture Generation from Speech. In Computer Graphics Forum, Vol. 42. Wiley Online Library, 206--216."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530750"},{"key":"e_1_3_2_1_15_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems 33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020), 6840--6851."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3382507.3418815"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01022"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680892"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00115"},{"key":"e_1_3_2_1_21_1","volume-title":"BEAT: A Large-Scale Semantic and Emotional Multi-Modal Dataset for Conversational Gestures Synthesis. arXiv preprint arXiv:2203.05297","author":"Liu Haiyang","year":"2022","unstructured":"Haiyang Liu, Zihao Zhu, Naoya Iwamoto, Yichen Peng, Zhengqing Li, You Zhou, Elif Bozkurt, and Bo Zheng. 2022. BEAT: A Large-Scale Semantic and Emotional Multi-Modal Dataset for Conversational Gestures Synthesis. arXiv preprint arXiv:2203.05297 (2022)."},{"key":"e_1_3_2_1_22_1","unstructured":"Shilong Liu Zhaoyang Zeng Tianhe Ren Feng Li Hao Zhang Jie Yang Chunyuan Li Jianwei Yang Hang Su Jun Zhu et al. 2023. Grounding DINO: Marrying DINO with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680684"},{"key":"e_1_3_2_1_25_1","volume-title":"EmotionGesture: Audio-driven diverse emotional co-speech 3D gesture generation","author":"Qi Xingqun","year":"2024","unstructured":"Xingqun Qi, Chen Liu, Lincheng Li, Jie Hou, Haoran Xin, and Xin Yu. 2024. EmotionGesture: Audio-driven diverse emotional co-speech 3D gesture generation. IEEE Transactions on Multimedia (2024)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00992"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:231591445","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:231591445"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_29_1","volume-title":"Denoising Diffusion Implicit Models. arXiv:2010.02502 (October","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising Diffusion Implicit Models. arXiv:2010.02502 (October 2020). https:\/\/arxiv.org\/abs\/2010.02502"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_21"},{"key":"e_1_3_2_1_31_1","volume-title":"MMoFusion: Multi-modal Co-Speech Motion Generation with Diffusion Model. ArXiv abs\/2403.02905","author":"Wang Sen","year":"2024","unstructured":"Sen Wang, Jiangning Zhang, Weijian Cao, Xiaobin Hu, Moran Li, Xiaozhong Ji, Xin Tan, Mengtian Li, Zhifeng Xie, Chengjie Wang, and Lizhuang Ma. 2024. MMoFusion: Multi-modal Co-Speech Motion Generation with Diffusion Model. ArXiv abs\/2403.02905 (2024). https:\/\/api.semanticscholar.org\/CorpusID:268247486"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01883"},{"key":"e_1_3_2_1_33_1","volume-title":"DiffuseStyleGesture: Stylized audio-driven co-speech gesture generation with diffusion models. arXiv preprint arXiv:2305.04919","author":"Yang Sicheng","year":"2023","unstructured":"Sicheng Yang, Zhiyong Wu, Minglei Li, Zhensong Zhang, Lei Hao, Weihong Bao, Ming Cheng, and Long Xiao. 2023. DiffuseStyleGesture: Stylized audio-driven co-speech gesture generation with diffusion models. arXiv preprint arXiv:2305.04919 (2023)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00230"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447978"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793720"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01902"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01016"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland"},"container-title":["Proceedings of the International Workshop on Generation and Evaluation of Non-verbal Behaviour for Embodied Agents"],"original-title":[],"deposited":{"date-parts":[[2025,10,24]],"date-time":"2025-10-24T15:00:33Z","timestamp":1761318033000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746268.3759433"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,26]]},"references-count":40,"alternative-id":["10.1145\/3746268.3759433","10.1145\/3746268"],"URL":"https:\/\/doi.org\/10.1145\/3746268.3759433","relation":{},"subject":[],"published":{"date-parts":[[2025,10,26]]},"assertion":[{"value":"2025-10-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}