{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:11:24Z","timestamp":1765357884505,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","funder":[{"name":"the open research fund of Pengcheng Laboratory","award":["2025KF1A0030"],"award-info":[{"award-number":["2025KF1A0030"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746268.3759436","type":"proceedings-article","created":{"date-parts":[[2025,10,24]],"date-time":"2025-10-24T15:00:16Z","timestamp":1761318016000},"page":"13-21","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["SARGes: Semantically Aligned Reliable Gesture Generation via Intent Chain"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8277-4100","authenticated-orcid":false,"given":"Nan","family":"Gao","sequence":"first","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4250-2258","authenticated-orcid":false,"given":"Yihua","family":"Bao","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2352-0896","authenticated-orcid":false,"given":"Dongdong","family":"Weng","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3789-000X","authenticated-orcid":false,"given":"Jiayi","family":"Zhao","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6077-7896","authenticated-orcid":false,"given":"Jia","family":"Li","sequence":"additional","affiliation":[{"name":"Lenovo, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0564-0915","authenticated-orcid":false,"given":"Yan","family":"Zhou","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7225-565X","authenticated-orcid":false,"given":"Pengfei","family":"Wan","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,26]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Hand and mind1,'' Advances in Visual Semiotics","author":"McNeill D.","year":"1992","unstructured":"D. McNeill, ''Hand and mind1,'' Advances in Visual Semiotics, vol. 351, 1992."},{"key":"e_1_3_2_1_2_1","volume-title":"Learning in audio-visual context: A review, analysis, and new perspective,'' arXiv preprint arXiv:2208.09579","author":"Wei Y.","year":"2022","unstructured":"Y. Wei, D. Hu, Y. Tian, and X. Li, ''Learning in audio-visual context: A review, analysis, and new perspective,'' arXiv preprint arXiv:2208.09579, 2022."},{"volume-title":"A comprehensive review of data-driven co-speech gesture generation,'' in Computer Graphics Forum","author":"Nyatsanga S.","key":"e_1_3_2_1_3_1","unstructured":"S. Nyatsanga, T. Kucherenko, C. Ahuja, G. E. Henter, and M. Neff, ''A comprehensive review of data-driven co-speech gesture generation,'' in Computer Graphics Forum, vol. 42, no. 2.hskip 1em plus 0.5em minus 0.4emrelax Wiley Online Library, 2023, pp. 569-596."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/1330511.1330516"},{"key":"e_1_3_2_1_5_1","volume-title":"Rogue: Robot gesture engine,'' in 2016 AAAI Spring Symposium Series","author":"Holladay R. M.","year":"2016","unstructured":"R. M. Holladay and S. S. Srinivasa, ''Rogue: Robot gesture engine,'' in 2016 AAAI Spring Symposium Series, 2016."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592458"},{"key":"e_1_3_2_1_7_1","first-page":"231","volume-title":"Diffmotion: Speech-driven gesture synthesis using denoising diffusion model,'' in International Conference on Multimedia Modeling. hskip 1em plus 0.5em minus 0.4emrelax Springer","author":"Zhang F.","year":"2023","unstructured":"F. Zhang, N. Ji, F. Gao, and Y. Li, ''Diffmotion: Speech-driven gesture synthesis using denoising diffusion model,'' in International Conference on Multimedia Modeling. hskip 1em plus 0.5em minus 0.4emrelax Springer, 2023, pp. 231-242."},{"key":"e_1_3_2_1_8_1","first-page":"7352","article-title":"Diffsheg: A diffusion-based approach for real-time speech-driven holistic 3d expression and gesture generation","author":"Chen J.","year":"2024","unstructured":"J. Chen, Y. Liu, J. Wang, A. Zeng, Y. Li, and Q. Chen, ''Diffsheg: A diffusion-based approach for real-time speech-driven holistic 3d expression and gesture generation,'' in Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 7352-7361.","journal-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592097"},{"key":"e_1_3_2_1_10_1","volume-title":"Motiongpt: Human motion as a foreign language,'' Advances in Neural Information Processing Systems","author":"Jiang B.","year":"2024","unstructured":"B. Jiang, X. Chen, W. Liu, J. Yu, G. Yu, and T. Chen, ''Motiongpt: Human motion as a foreign language,'' Advances in Neural Information Processing Systems, vol. 36, 2024."},{"key":"e_1_3_2_1_11_1","article-title":"Gesgpt: Speech gesture synthesis with text parsing from chatgpt","author":"Gao N.","year":"2024","unstructured":"N. Gao, Z. Zhao, Z. Zeng, S. Zhang, D. Weng, and Y. Bao, ''Gesgpt: Speech gesture synthesis with text parsing from chatgpt,'' IEEE Robotics and Automation Letters, 2024.","journal-title":"IEEE Robotics and Automation Letters"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3658134"},{"key":"e_1_3_2_1_13_1","volume-title":"A comprehensive survey of hallucination mitigation techniques in large language models,'' arXiv preprint arXiv:2401.01313","author":"Tonmoy S.","year":"2024","unstructured":"S. Tonmoy, S. Zaman, V. Jain, A. Rani, V. Rawte, A. Chadha, and A. Das, ''A comprehensive survey of hallucination mitigation techniques in large language models,'' arXiv preprint arXiv:2401.01313, 2024."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.applanim.2015.04.001"},{"key":"e_1_3_2_1_15_1","first-page":"242","volume-title":"Gesticulator: A framework for semantically-aware speech-driven gesture generation,'' in Proceedings of the 2020 international conference on multimodal interaction","author":"Kucherenko T.","year":"2020","unstructured":"T. Kucherenko, P. Jonell, S. Van Waveren, G. E. Henter, S. Alexandersson, I. Leite, and H. Kjellstr\u00f6m, ''Gesticulator: A framework for semantically-aware speech-driven gesture generation,'' in Proceedings of the 2020 international conference on multimodal interaction, 2020, pp. 242-250."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_1_17_1","first-page":"10","article-title":"Seeg: Semantic energized co-speech gesture generation","author":"Liang Y.","year":"2022","unstructured":"Y. Liang, Q. Feng, L. Zhu, L. Hu, P. Pan, and Y. Yang, ''Seeg: Semantic energized co-speech gesture generation,'' in Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 10,473-10,482.","journal-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"H. Teshima N. Wake D. Thomas Y. Nakashima H. Kawasaki and K. Ikeuchi ''Deep gesture generation for social robots using type-specific libraries '' in 2022 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS). hskip 1em plus 0.5em minus 0.4emrelax IEEE 2022 pp. 8286-8291.","DOI":"10.1109\/IROS47612.2022.9981734"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"U. Bhattacharya N. Rewkowski A. Banerjee P. Guhan A. Bera and D. Manocha ''Text2gestures: A transformer-based network for generating emotive body gestures for virtual agents '' in 2021 IEEE virtual reality and 3D user interfaces (VR). hskip 1em plus 0.5em minus 0.4emrelax IEEE 2021 pp. 1-10.","DOI":"10.1109\/VR50410.2021.00037"},{"key":"e_1_3_2_1_20_1","first-page":"3418","article-title":"Audio-driven neural gesture reenactment with video motion graphs","author":"Zhou Y.","year":"2022","unstructured":"Y. Zhou, J. Yang, D. Li, J. Saito, D. Aneja, and E. Kalogerakis, ''Audio-driven neural gesture reenactment with video motion graphs,'' in Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 2022, pp. 3418-3428.","journal-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition"},{"key":"e_1_3_2_1_21_1","first-page":"772","article-title":"Gesture motion graphs for few-shot speech-driven gesture reenactment","author":"Zhao Z.","year":"2023","unstructured":"Z. Zhao, N. Gao, Z. Zeng, G. Zhang, J. Liu, and S. Zhang, ''Gesture motion graphs for few-shot speech-driven gesture reenactment,'' in Proceedings of the 25th International Conference on Multimodal Interaction, 2023, pp. 772-778.","journal-title":"Proceedings of the 25th International Conference on Multimodal Interaction"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"S. Zhang J. Yuan M. Liao and L. Zhang ''Text2video: Text-driven talking-head video synthesis with personalized phoneme-pose dictionary '' in ICASSP 2022-2022 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP). hskip 1em plus 0.5em minus 0.4emrelax IEEE 2022 pp. 2659-2663.","DOI":"10.1109\/ICASSP43922.2022.9747380"},{"key":"e_1_3_2_1_23_1","first-page":"1","volume-title":"Story-to-motion: Synthesizing infinite and controllable character animation from long text,'' in SIGGRAPH Asia 2023 Technical Communications","author":"Qing Z.","year":"2023","unstructured":"Z. Qing, Z. Cai, Z. Yang, and L. Yang, ''Story-to-motion: Synthesizing infinite and controllable character animation from long text,'' in SIGGRAPH Asia 2023 Technical Communications, 2023, pp. 1-4."},{"key":"e_1_3_2_1_24_1","first-page":"582","article-title":"Digital life project: Autonomous 3d characters with social intelligence","author":"Cai Z.","year":"2024","unstructured":"Z. Cai, J. Jiang, Z. Qing, X. Guo, M. Zhang, Z. Lin, H. Mei, C. Wei, R. Wang, W. Yin et al., ''Digital life project: Autonomous 3d characters with social intelligence,'' in Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 582-592.","journal-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition"},{"key":"e_1_3_2_1_25_1","first-page":"9118","volume-title":"Language models as zero-shot planners: Extracting actionable knowledge for embodied agents,'' in International conference on machine learning. hskip 1em plus 0.5em minus 0.4emrelax PMLR","author":"Huang W.","year":"2022","unstructured":"W. Huang, P. Abbeel, D. Pathak, and I. Mordatch, ''Language models as zero-shot planners: Extracting actionable knowledge for embodied agents,'' in International conference on machine learning. hskip 1em plus 0.5em minus 0.4emrelax PMLR, 2022, pp. 9118-9147."},{"key":"e_1_3_2_1_26_1","first-page":"1","article-title":"Generative agents: Interactive simulacra of human behavior","author":"Park J. S.","year":"2023","unstructured":"J. S. Park, J. O'Brien, C. J. Cai, M. R. Morris, P. Liang, and M. S. Bernstein, ''Generative agents: Interactive simulacra of human behavior,'' in Proceedings of the 36th annual acm symposium on user interface software and technology, 2023, pp. 1-22.","journal-title":"Proceedings of the 36th annual acm symposium on user interface software and technology"},{"key":"e_1_3_2_1_27_1","first-page":"24","volume-title":"Zhou et al., ''Chain-of-thought prompting elicits reasoning in large language models,'' Advances in neural information processing systems","author":"Wei J.","year":"2022","unstructured":"J. Wei, X. Wang, D. Schuurmans, M. Bosma, F. Xia, E. Chi, Q. V. Le, D. Zhou et al., ''Chain-of-thought prompting elicits reasoning in large language models,'' Advances in neural information processing systems, vol. 35, pp. 24,824-24,837, 2022."},{"key":"e_1_3_2_1_28_1","volume-title":"Reflexion: Language agents with verbal reinforcement learning.(2023),'' arXiv preprint cs.AI\/2303.11366","author":"Shinn N.","year":"2023","unstructured":"N. Shinn, F. Cassano, B. Labash, A. Gopinath, K. Narasimhan, and S. Yao, ''Reflexion: Language agents with verbal reinforcement learning.(2023),'' arXiv preprint cs.AI\/2303.11366, 2023."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.2527\/jas1987.6551213x"},{"key":"e_1_3_2_1_30_1","volume-title":"Contrastive preference optimization: Pushing the boundaries of llm performance in machine translation,'' arXiv preprint arXiv:2401.08417","author":"Xu H.","year":"2024","unstructured":"H. Xu, A. Sharaf, Y. Chen, W. Tan, L. Shen, B. Van Durme, K. Murray, and Y. J. Kim, ''Contrastive preference optimization: Pushing the boundaries of llm performance in machine translation,'' arXiv preprint arXiv:2401.08417, 2024."},{"key":"e_1_3_2_1_31_1","volume-title":"Huang et al., ''Qwen technical report,'' arXiv preprint arXiv:2309.16609","author":"Bai J.","year":"2023","unstructured":"J. Bai, S. Bai, Y. Chu, Z. Cui, K. Dang, X. Deng, Y. Fan, W. Ge, Y. Han, F. Huang et al., ''Qwen technical report,'' arXiv preprint arXiv:2309.16609, 2023."},{"key":"e_1_3_2_1_32_1","first-page":"1","article-title":"Using an llm to help with code understanding","author":"Nam D.","year":"2024","unstructured":"D. Nam, A. Macvean, V. Hellendoorn, B. Vasilescu, and B. Myers, ''Using an llm to help with code understanding,'' in Proceedings of the IEEE\/ACM 46th International Conference on Software Engineering, 2024, pp. 1-13.","journal-title":"Proceedings of the IEEE\/ACM 46th International Conference on Software Engineering"},{"key":"e_1_3_2_1_33_1","first-page":"1","article-title":"Using an llm to help with code understanding","author":"Nam D.","year":"2024","unstructured":"D. Nam, A. Macvean, V. Hellendoorn, B. Vasilescu, and B. Myers, ''Using an llm to help with code understanding,'' in Proceedings of the IEEE\/ACM 46th International Conference on Software Engineering, 2024, pp. 1-13.","journal-title":"Proceedings of the IEEE\/ACM 46th International Conference on Software Engineering"},{"key":"e_1_3_2_1_34_1","first-page":"1","article-title":"DiffuseStyleGesture: Stylized audio-driven co-speech gesture generation with diffusion models","author":"Yang S.","year":"2023","unstructured":"S. Yang, Z. Wu, M. Li, Z. Zhang, L. Hao, W. Bao, M. Cheng, and L. Xiao, ''DiffuseStyleGesture: Stylized audio-driven co-speech gesture generation with diffusion models,'' in Proceedings of the 32nd International Joint Conference on Artificial Intelligence, pp.1-11, 2023.","journal-title":"Proceedings of the 32nd International Joint Conference on Artificial Intelligence"}],"event":{"name":"MM '25:The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland"},"container-title":["Proceedings of the International Workshop on Generation and Evaluation of Non-verbal Behaviour for Embodied Agents"],"original-title":[],"deposited":{"date-parts":[[2025,10,24]],"date-time":"2025-10-24T15:00:22Z","timestamp":1761318022000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746268.3759436"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,26]]},"references-count":34,"alternative-id":["10.1145\/3746268.3759436","10.1145\/3746268"],"URL":"https:\/\/doi.org\/10.1145\/3746268.3759436","relation":{},"subject":[],"published":{"date-parts":[[2025,10,26]]},"assertion":[{"value":"2025-10-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}