{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T08:55:40Z","timestamp":1773392140012,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,9]],"date-time":"2023-10-09T00:00:00Z","timestamp":1696809600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Coordena\u00e7\u00e3o de Aperfei\u00e7oamento de Pessoal de Nivel Superior \u2013 Brasil (CAPES)","award":["001"],"award-info":[{"award-number":["001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,9]]},"DOI":"10.1145\/3610661.3616554","type":"proceedings-article","created":{"date-parts":[[2023,10,9]],"date-time":"2023-10-09T16:51:22Z","timestamp":1696870282000},"page":"193-199","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Gesture Generation with Diffusion Models Aided by Speech Activity Information"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5800-654X","authenticated-orcid":false,"given":"Rodolfo L.","family":"Tonoli","sequence":"first","affiliation":[{"name":"Dept. of Computer Engineering and Automation, School of Electrical and Computer Engineering, University of Campinas (UNICAMP), Brazil and Artificial Intelligence Lab., Recod.ai, University of Campinas (UNICAMP), Brazil"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8821-4972","authenticated-orcid":false,"given":"Leonardo B. de M. 
M.","family":"Marques","sequence":"additional","affiliation":[{"name":"CPQD, Brazil"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1029-3420","authenticated-orcid":false,"given":"Lucas H.","family":"Ueda","sequence":"additional","affiliation":[{"name":"CPQD, Brazil"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1534-5744","authenticated-orcid":false,"given":"Paula Dornhofer Paro","family":"Costa","sequence":"additional","affiliation":[{"name":"Dept. of Computer Engineering and Automation, School of Electrical and Computer Engineering, University of Campinas (UNICAMP), Brazil and Artificial Intelligence Lab., Recod.ai, University of Campinas (UNICAMP), Brazil"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,9]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Listen, denoise, action! audio-driven motion synthesis with diffusion models. arXiv preprint arXiv:2211.09707","author":"Alexanderson Simon","year":"2022","unstructured":"Simon Alexanderson, Rajmund Nagy, Jonas Beskow, and Gustav\u00a0Eje Henter. 2022. Listen, denoise, action! audio-driven motion synthesis with diffusion models. arXiv preprint arXiv:2211.09707 (2022)."},{"key":"e_1_3_2_1_2_1","volume-title":"GestureDiffuCLIP: Gesture diffusion model with CLIP latents. arXiv preprint arXiv:2303.14613","author":"Ao Tenglong","year":"2023","unstructured":"Tenglong Ao, Zeyi Zhang, and Libin Liu. 2023. GestureDiffuCLIP: Gesture diffusion model with CLIP latents. 
arXiv preprint arXiv:2303.14613 (2023)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747707"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3558060"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_1_7_1","volume-title":"Diffusion models beat gans on image synthesis. Advances in neural information processing systems 34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in neural information processing systems 34 (2021), 8780\u20138794."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3359566.3360053"},{"key":"e_1_3_2_1_9_1","volume-title":"Computer Graphics Forum, Vol.\u00a042","author":"Ghorbani Saeed","unstructured":"Saeed Ghorbani, Ylva Ferstl, Daniel Holden, Nikolaus\u00a0F Troje, and Marc-Andr\u00e9 Carbonneau. 2023. ZeroEGGS: Zero-shot Example-based Gesture Generation from Speech. In Computer Graphics Forum, Vol.\u00a042. Wiley Online Library, 206\u2013216."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417836"},{"key":"e_1_3_2_1_11_1","volume-title":"Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, William Chan, Chitwan Saharia, Jay Whang, Ruiqi Gao, Alexey Gritsenko, Diederik\u00a0P Kingma, Ben Poole, Mohammad Norouzi, David\u00a0J Fleet, 2022. Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)."},{"key":"e_1_3_2_1_12_1","volume-title":"Denoising diffusion probabilistic models. 
Advances in neural information processing systems 33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020), 6840\u20136851."},{"key":"e_1_3_2_1_13_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.167"},{"key":"e_1_3_2_1_15_1","volume-title":"Diffwave: A versatile diffusion model for audio synthesis. arXiv preprint arXiv:2009.09761","author":"Kong Zhifeng","year":"2020","unstructured":"Zhifeng Kong, Wei Ping, Jiaji Huang, Kexin Zhao, and Bryan Catanzaro. 2020. Diffwave: A versatile diffusion model for audio synthesis. arXiv preprint arXiv:2009.09761 (2020)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397481.3450692"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577190.3616120"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV \u201919)","author":"Lee Gilwoo","year":"2019","unstructured":"Gilwoo Lee, Zhiwei Deng, Shugao Ma, Takaaki Shiratori, Siddhartha\u00a0S Srinivasa, and Yaser Sheikh. 2019. Talking with hands 16.2 m: A large-scale dataset of synchronized body-finger motion and audio for conversational motion analysis and synthesis. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV \u201919). 763\u2013772."},{"key":"e_1_3_2_1_19_1","volume-title":"Audioldm: Text-to-audio generation with latent diffusion models. 
arXiv preprint arXiv:2301.12503","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, Xubo Liu, Danilo Mandic, Wenwu Wang, and Mark\u00a0D Plumbley. 2023. Audioldm: Text-to-audio generation with latent diffusion models. arXiv preprint arXiv:2301.12503 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Savitzky\u2013Golay smoothing and differentiation filter for even number data. Signal processing 85, 7","author":"Luo Jianwen","year":"2005","unstructured":"Jianwen Luo, Kui Ying, and Jing Bai. 2005. Savitzky\u2013Golay smoothing and differentiation filter for even number data. Signal processing 85, 7 (2005), 1429\u20131434."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1177\/002383099403700208"},{"key":"e_1_3_2_1_22_1","volume-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2021. Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)."},{"key":"e_1_3_2_1_23_1","volume-title":"Computer Graphics Forum, Vol.\u00a042","author":"Nyatsanga Simbarashe","unstructured":"Simbarashe Nyatsanga, Taras Kucherenko, Chaitanya Ahuja, Gustav\u00a0Eje Henter, and Michael Neff. 2023. A Comprehensive Review of Data-Driven Co-Speech Gesture Generation. In Computer Graphics Forum, Vol.\u00a042. Wiley Online Library, 569\u2013596."},{"key":"e_1_3_2_1_24_1","volume-title":"International conference on machine learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. 
Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_26_1","volume-title":"Human Motion Diffusion Model. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=SJ1kSyO2jwu","author":"Tevet Guy","year":"2023","unstructured":"Guy Tevet, Sigal Raab, Brian Gordon, Yoni Shafir, Daniel Cohen-or, and Amit\u00a0Haim Bermano. 2023. Human Motion Diffusion Model. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=SJ1kSyO2jwu"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00051"},{"key":"e_1_3_2_1_28_1","volume-title":"Mimicry and prosocial behavior. Psychological science 15, 1","author":"Van\u00a0Baaren B","year":"2004","unstructured":"Rick\u00a0B Van\u00a0Baaren, Rob\u00a0W Holland, Kerry Kawakami, and Ad Van\u00a0Knippenberg. 2004. Mimicry and prosocial behavior. Psychological science 15, 1 (2004), 71\u201374."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","unstructured":"V. Vinayagamoorthy M. Gillies A. Steed E. Tanguy X. Pan C. Loscos and M. Slater. 2006. Building Expression into Virtual Characters. In Eurographics 2006 - State of the Art Reports Brian Wyvill and Alexander Wilkie (Eds.). The Eurographics Association. https:\/\/doi.org\/10.2312\/egst.20061052","DOI":"10.2312\/egst.20061052"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Petra Wagner Zofia Malisz and Stefan Kopp. 2014. Gesture and speech in interaction: An overview. 209\u2013232\u00a0pages.","DOI":"10.1016\/j.specom.2013.09.008"},{"key":"e_1_3_2_1_31_1","volume-title":"Diffusion models: A comprehensive survey of methods and applications. 
arXiv preprint arXiv:2209.00796","author":"Yang Ling","year":"2022","unstructured":"Ling Yang, Zhilong Zhang, Yang Song, Shenda Hong, Runsheng Xu, Yue Zhao, Yingxia Shao, Wentao Zhang, Bin Cui, and Ming-Hsuan Yang. 2022. Diffusion models: A comprehensive survey of methods and applications. arXiv preprint arXiv:2209.00796 (2022)."},{"key":"e_1_3_2_1_32_1","volume-title":"DiffuseStyleGesture: Stylized Audio-Driven Co-Speech Gesture Generation with Diffusion Models. arXiv preprint arXiv:2305.04919","author":"Yang Sicheng","year":"2023","unstructured":"Sicheng Yang, Zhiyong Wu, Minglei Li, Zhensong Zhang, Lei Hao, Weihong Bao, Ming Cheng, and Long Xiao. 2023. DiffuseStyleGesture: Stylized Audio-Driven Co-Speech Gesture Generation with Diffusion Models. arXiv preprint arXiv:2305.04919 (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3558058"},{"key":"e_1_3_2_1_34_1","volume-title":"DiffMotion: Speech-Driven Gesture Synthesis Using Denoising Diffusion Model","author":"Zhang Fan","unstructured":"Fan Zhang, Naye Ji, Fuxing Gao, and Yongping Li. 2023. DiffMotion: Speech-Driven Gesture Synthesis Using Denoising Diffusion Model. In MultiMedia Modeling, Duc-Tien Dang-Nguyen, Cathal Gurrin, Martha Larson, Alan\u00a0F. Smeaton, Stevan Rudinac, Minh-Son Dao, Christoph Trattner, and Phoebe Chen (Eds.). Springer International Publishing, Cham, 231\u2013242."},{"key":"e_1_3_2_1_35_1","volume-title":"Motiondiffuse: Text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001","author":"Zhang Mingyuan","year":"2022","unstructured":"Mingyuan Zhang, Zhongang Cai, Liang Pan, Fangzhou Hong, Xinying Guo, Lei Yang, and Ziwei Liu. 2022. Motiondiffuse: Text-driven human motion generation with diffusion model. 
arXiv preprint arXiv:2208.15001 (2022)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00589"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01016"}],"event":{"name":"ICMI '23: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","location":"Paris France","acronym":"ICMI '23","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3610661.3616554","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3610661.3616554","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:34:01Z","timestamp":1755891241000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3610661.3616554"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,9]]},"references-count":37,"alternative-id":["10.1145\/3610661.3616554","10.1145\/3610661"],"URL":"https:\/\/doi.org\/10.1145\/3610661.3616554","relation":{},"subject":[],"published":{"date-parts":[[2023,10,9]]},"assertion":[{"value":"2023-10-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}