{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T08:56:10Z","timestamp":1773392170272,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,9]],"date-time":"2023-10-09T00:00:00Z","timestamp":1696809600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,9]]},"DOI":"10.1145\/3610661.3616552","type":"proceedings-article","created":{"date-parts":[[2023,10,9]],"date-time":"2023-10-09T16:51:22Z","timestamp":1696870282000},"page":"179-185","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["DiffuGesture: Generating Human Gesture From Two-person Dialogue With Diffusion Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-2661-3692","authenticated-orcid":false,"given":"Weiyu","family":"Zhao","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5233-473X","authenticated-orcid":false,"given":"Liangxiao","family":"Hu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5200-3420","authenticated-orcid":false,"given":"Shengping","family":"Zhang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,9]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"denoise, action! Audio-driven motion synthesis with diffusion models. arXiv preprint arXiv:2211.09707","author":"Alexanderson Simon","year":"2022","unstructured":"Simon Alexanderson, Rajmund Nagy, Jonas Beskow, and Gustav\u00a0Eje Henter. 2022. Listen, denoise, action! Audio-driven motion synthesis with diffusion models. arXiv preprint arXiv:2211.09707 (2022)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555435"},{"key":"e_1_3_2_1_3_1","volume-title":"Text2gestures: A transformer-based network for generating emotive body gestures for virtual agents. In 2021 IEEE virtual reality and 3D user interfaces (VR)","author":"Bhattacharya Uttaran","unstructured":"Uttaran Bhattacharya, Nicholas Rewkowski, Abhishek Banerjee, Pooja Guhan, Aniket Bera, and Dinesh Manocha. 2021. Text2gestures: A transformer-based network for generating emotive body gestures for virtual agents. In 2021 IEEE virtual reality and 3D user interfaces (VR). IEEE, 1\u201310."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3558060"},{"key":"e_1_3_2_1_5_1","volume-title":"Diffusion models beat gans on image synthesis. Advances in neural information processing systems 34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in neural information processing systems 34 (2021), 8780\u20138794."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"S. Ginosar A. Bar G. Kohavi C. Chan A. Owens and J. Malik. 2019. Learning Individual Styles of Conversational Gesture. In Computer Vision and Pattern Recognition (CVPR). IEEE.","DOI":"10.1109\/CVPR.2019.00361"},{"key":"e_1_3_2_1_7_1","first-page":"27953","article-title":"Flexible diffusion modeling of long videos","volume":"35","author":"Harvey William","year":"2022","unstructured":"William Harvey, Saeid Naderiparizi, Vaden Masrani, Christian Weilbach, and Frank Wood. 2022. Flexible diffusion modeling of long videos. Advances in Neural Information Processing Systems 35 (2022), 27953\u201327965.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_8_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems 33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020), 6840\u20136851."},{"key":"e_1_3_2_1_9_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_1_10_1","volume-title":"MotionGPT: Human Motion as a Foreign Language. arXiv preprint arXiv:2306.14795","author":"Jiang Biao","year":"2023","unstructured":"Biao Jiang, Xin Chen, Wen Liu, Jingyi Yu, Gang Yu, and Tao Chen. 2023. MotionGPT: Human Motion as a Foreign Language. arXiv preprint arXiv:2306.14795 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma P","year":"2013","unstructured":"Diederik\u00a0P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3382507.3418815"},{"key":"e_1_3_2_1_13_1","volume-title":"GENEA Workshop 2021: The 2nd Workshop on Generation and Evaluation of Non-verbal Behaviour for Embodied Agents. In Proceedings of the 2021 International Conference on Multimodal Interaction. 872\u2013873","author":"Kucherenko Taras","year":"2021","unstructured":"Taras Kucherenko, Patrik Jonell, Youngwoo Yoon, Pieter Wolfert, Zerrin Yumak, and Gustav Henter. 2021. GENEA Workshop 2021: The 2nd Workshop on Generation and Evaluation of Non-verbal Behaviour for Embodied Agents. In Proceedings of the 2021 International Conference on Multimodal Interaction. 872\u2013873."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577190.3616120"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 763\u2013772","author":"Lee Gilwoo","year":"2019","unstructured":"Gilwoo Lee, Zhiwei Deng, Shugao Ma, Takaaki Shiratori, Siddhartha\u00a0S Srinivasa, and Yaser Sheikh. 2019. Talking with hands 16.2 m: A large-scale dataset of synchronized body-finger motion and audio for conversational motion analysis and synthesis. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 763\u2013772."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Sergey Levine Philipp Kr\u00e4henb\u00fchl Sebastian Thrun and Vladlen Koltun. 2010. Gesture controllers. In Acm siggraph 2010 papers. 1\u201311.","DOI":"10.1145\/1833349.1778861"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20014"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3558059"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1473-4192.2004.0057m.x"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01089"},{"key":"e_1_3_2_1_23_1","volume-title":"International conference on machine learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461967"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_21"},{"key":"e_1_3_2_1_27_1","volume-title":"Human motion diffusion model. arXiv preprint arXiv:2209.14916","author":"Tevet Guy","year":"2022","unstructured":"Guy Tevet, Sigal Raab, Brian Gordon, Yonatan Shafir, Daniel Cohen-Or, and Amit\u00a0H Bermano. 2022. Human motion diffusion model. arXiv preprint arXiv:2209.14916 (2022)."},{"key":"e_1_3_2_1_28_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Petra Wagner Zofia Malisz and Stefan Kopp. 2014. Gesture and speech in interaction: An overview. 209\u2013232\u00a0pages.","DOI":"10.1016\/j.specom.2013.09.008"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.3390\/electronics10030228"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00407"},{"key":"e_1_3_2_1_32_1","volume-title":"EMoG: Synthesizing Emotive Co-speech 3D Gesture with Diffusion Model. arXiv preprint arXiv:2306.11496","author":"Yin Lianying","year":"2023","unstructured":"Lianying Yin, Yijun Wang, Tianyu He, Jinming Liu, Wei Zhao, Bohan Li, Xin Jin, and Jianxin Lin. 2023. EMoG: Synthesizing Emotive Co-speech 3D Gesture with Diffusion Model. arXiv preprint arXiv:2306.11496 (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793720"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3558058"},{"key":"e_1_3_2_1_36_1","volume-title":"T2m-gpt: Generating human motion from textual descriptions with discrete representations. arXiv preprint arXiv:2301.06052","author":"Zhang Jianrong","year":"2023","unstructured":"Jianrong Zhang, Yangsong Zhang, Xiaodong Cun, Shaoli Huang, Yong Zhang, Hongwei Zhao, Hongtao Lu, and Xi Shen. 2023. T2m-gpt: Generating human motion from textual descriptions with discrete representations. arXiv preprint arXiv:2301.06052 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"Motiondiffuse: Text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001","author":"Zhang Mingyuan","year":"2022","unstructured":"Mingyuan Zhang, Zhongang Cai, Liang Pan, Fangzhou Hong, Xinying Guo, Lei Yang, and Ziwei Liu. 2022. Motiondiffuse: Text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001 (2022)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3558063"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01016"}],"event":{"name":"ICMI '23: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","location":"Paris France","acronym":"ICMI '23","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3610661.3616552","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3610661.3616552","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:33:50Z","timestamp":1755891230000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3610661.3616552"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,9]]},"references-count":39,"alternative-id":["10.1145\/3610661.3616552","10.1145\/3610661"],"URL":"https:\/\/doi.org\/10.1145\/3610661.3616552","relation":{},"subject":[],"published":{"date-parts":[[2023,10,9]]},"assertion":[{"value":"2023-10-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}