{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,24]],"date-time":"2026-06-24T16:22:42Z","timestamp":1782318162893,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,8,7]],"date-time":"2022-08-07T00:00:00Z","timestamp":1659830400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100000781","name":"European Research Council","doi-asserted-by":"publisher","award":["770784"],"award-info":[{"award-number":["770784"]}],"id":[{"id":"10.13039\/501100000781","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,8,7]]},"DOI":"10.1145\/3528233.3530750","type":"proceedings-article","created":{"date-parts":[[2022,7,20]],"date-time":"2022-07-20T13:56:43Z","timestamp":1658325403000},"page":"1-9","source":"Crossref","is-referenced-by-count":59,"title":["A Motion Matching-based Framework for Controllable Gesture Synthesis from Speech"],"prefix":"10.1145","author":[{"given":"Ikhsanul","family":"Habibie","sequence":"first","affiliation":[{"name":"Department of Visual Computing and Artificial Intelligence, Max Planck Institute for Informatics, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mohamed","family":"Elgharib","sequence":"additional","affiliation":[{"name":"Department of Visual Computing and Artificial Intelligence, Max Planck Institute for Informatics, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kripasindhu","family":"Sarkar","sequence":"additional","affiliation":[{"name":"Department of Visual Computing and Artificial Intelligence, Max Planck Institute for Informatics, 
Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ahsan","family":"Abdullah","sequence":"additional","affiliation":[{"name":"UC Davis, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Simbarashe","family":"Nyatsanga","sequence":"additional","affiliation":[{"name":"UC Davis, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Michael","family":"Neff","sequence":"additional","affiliation":[{"name":"UC Davis, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Christian","family":"Theobalt","sequence":"additional","affiliation":[{"name":"Department of Visual Computing and Artificial Intelligence, Max Planck Institute for Informatics, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2022,8,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Chaitanya Ahuja Dong\u00a0Won Lee Yukiko\u00a0I. Nakano and Louis-Philippe Morency. 2020. Style Transfer for Co-speech Gesture Animation: A Multi-speaker Conditional-Mixture Approach. In European Conference on Computer Vision (ECCV) Andrea Vedaldi Horst Bischof Thomas Brox and Jan-Michael Frahm (Eds.).  Chaitanya Ahuja Dong\u00a0Won Lee Yukiko\u00a0I. Nakano and Louis-Philippe Morency. 2020. Style Transfer for Co-speech Gesture Animation: A Multi-speaker Conditional-Mixture Approach. In European Conference on Computer Vision (ECCV) Andrea Vedaldi Horst Bischof Thomas Brox and Jan-Michael Frahm (Eds.).","DOI":"10.1007\/978-3-030-58523-5_15"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Simon Alexanderson Gustav\u00a0Eje Henter Taras Kucherenko and Jonas Beskow. 2020. Style-controllable speech-driven gesture synthesis using normalising flows. Computer Graphics Forum(2020).  Simon Alexanderson Gustav\u00a0Eje Henter Taras Kucherenko and Jonas Beskow. 2020. 
Style-controllable speech-driven gesture synthesis using normalising flows. Computer Graphics Forum(2020).","DOI":"10.1111\/cgf.13946"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Okan Arikan and D.\u00a0A. Forsyth. 2002. Interactive Motion Generation from Examples. ACM Trans. Graph. (2002).  Okan Arikan and D.\u00a0A. Forsyth. 2002. Interactive Motion Generation from Examples. ACM Trans. Graph. (2002).","DOI":"10.1145\/566570.566606"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.5555\/3305381.3305404"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475223"},{"key":"e_1_3_2_1_6_1","unstructured":"Michael B\u00fcttner and Simon Clavet.2015. Motion Matching. https:\/\/www.youtube.com\/watch?v=z_wpgHFSWss&.  Michael B\u00fcttner and Simon Clavet.2015. Motion Matching. https:\/\/www.youtube.com\/watch?v=z_wpgHFSWss&."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Zhe Cao Tomas Simon Shih-En Wei and Yaser Sheikh. 2017. Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields. In CVPR.  Zhe Cao Tomas Simon Shih-En Wei and Yaser Sheikh. 2017. Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields. In CVPR.","DOI":"10.1109\/CVPR.2017.143"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/2697.001.0001"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/192161.192272"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of SIGGRAPH","author":"Cassell Justine","year":"2001","unstructured":"Justine Cassell , H. Vilhj\u00e1lmsson , and T. Bickmore . 2001. BEAT: the Behavior Expression Animation Toolkit . In Proceedings of SIGGRAPH 2001 . 477\u2013486. Justine Cassell, H. Vilhj\u00e1lmsson, and T. Bickmore. 2001. BEAT: the Behavior Expression Animation Toolkit. In Proceedings of SIGGRAPH 2001. 
477\u2013486."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-23974-8_14"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.5555\/2615731.2615857"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"J.\u00a0S. Chung and A. Zisserman. 2016. Out of time: automated lip sync in the wild. In ACCV.  J.\u00a0S. Chung and A. Zisserman. 2016. Out of time: automated lip sync in the wild. In ACCV.","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Ylva Ferstl Michael Neff and Rachel McDonnell. 2019. Multi-Objective Adversarial Gesture Generation. In Motion Interaction and Games.  Ylva Ferstl Michael Neff and Rachel McDonnell. 2019. Multi-Objective Adversarial Gesture Generation. In Motion Interaction and Games.","DOI":"10.1145\/3359566.3360053"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Ylva Ferstl Michael Neff and Rachel McDonnell. 2020. Understanding the Predictability of Gesture Parameters from Speech and Their Perceptual Importance(Proceedings of the International Conference on Intelligent Virtual Agents).  Ylva Ferstl Michael Neff and Rachel McDonnell. 2020. Understanding the Predictability of Gesture Parameters from Speech and Their Perceptual Importance(Proceedings of the International Conference on Intelligent Virtual Agents).","DOI":"10.1145\/3383652.3423882"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Ylva Ferstl Michael Neff and Rachel McDonnell. 2021. ExpressGesture: Expressive gesture generation from speech through database matching. Computer Animation and Virtual Worlds(2021).  Ylva Ferstl Michael Neff and Rachel McDonnell. 2021. ExpressGesture: Expressive gesture generation from speech through database matching. Computer Animation and Virtual Worlds(2021).","DOI":"10.1002\/cav.2016"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"S. Ginosar A. Bar G. Kohavi C. 
Chan A. Owens and J. Malik. 2019. Learning Individual Styles of Conversational Gesture. In Computer Vision and Pattern Recognition (CVPR). IEEE.  S. Ginosar A. Bar G. Kohavi C. Chan A. Owens and J. Malik. 2019. Learning Individual Styles of Conversational Gesture. In Computer Vision and Pattern Recognition (CVPR). IEEE.","DOI":"10.1109\/CVPR.2019.00361"},{"key":"e_1_3_2_1_18_1","unstructured":"Ishaan Gulrajani Faruk Ahmed Martin Arjovsky Vincent Dumoulin and Aaron\u00a0C Courville. 2017. Improved Training of Wasserstein GANs. In Advances in Neural Information Processing Systems.  Ishaan Gulrajani Faruk Ahmed Martin Arjovsky Vincent Dumoulin and Aaron\u00a0C Courville. 2017. Improved Training of Wasserstein GANs. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472306.3478335"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267851.3267878"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Daniel Holden Oussama Kanoun Maksym Perepichka and Tiberiu Popa. 2020. Learned Motion Matching. ACM Trans. Graph. (2020).  Daniel Holden Oussama Kanoun Maksym Perepichka and Tiberiu Popa. 2020. Learned Motion Matching. ACM Trans. Graph. (2020).","DOI":"10.1145\/3386569.3392440"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the 32nd International Conference on Machine Learning.","author":"Ioffe Sergey","year":"2015","unstructured":"Sergey Ioffe and Christian Szegedy . 2015 . Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . In Proceedings of the 32nd International Conference on Machine Learning. Sergey Ioffe and Christian Szegedy. 2015. Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. In Proceedings of the 32nd International Conference on Machine Learning."},{"key":"e_1_3_2_1_23_1","volume-title":"Image-to-Image Translation with Conditional Adversarial Networks. 
CVPR","author":"Isola Phillip","year":"2017","unstructured":"Phillip Isola , Jun-Yan Zhu , Tinghui Zhou , and Alexei\u00a0 A Efros . 2017. Image-to-Image Translation with Conditional Adversarial Networks. CVPR ( 2017 ). Phillip Isola, Jun-Yan Zhu, Tinghui Zhou, and Alexei\u00a0A Efros. 2017. Image-to-Image Translation with Conditional Adversarial Networks. CVPR (2017)."},{"key":"e_1_3_2_1_24_1","volume-title":"Kingma and Jimmy Ba","author":"P.","year":"2015","unstructured":"Diederik\u00a0 P. Kingma and Jimmy Ba . 2015 . Adam : A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds .). Diederik\u00a0P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.)."},{"key":"e_1_3_2_1_25_1","volume-title":"Motion Graphs. In Proceedings of SIGGRAPH \u201902","author":"Kovar Lucas","year":"2002","unstructured":"Lucas Kovar , Michael Gleicher , and Fr\u00e9d\u00e9ric Pighin . 2002 . Motion Graphs. In Proceedings of SIGGRAPH \u201902 . San Antonio, TX. Lucas Kovar, Michael Gleicher, and Fr\u00e9d\u00e9ric Pighin. 2002. Motion Graphs. In Proceedings of SIGGRAPH \u201902. San Antonio, TX."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3382507.3418815"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Taras Kucherenko Patrik Jonell Youngwoo Yoon Pieter Wolfert and Gustav\u00a0Eje Henter. 2021. A Large Crowdsourced Evaluation of Gesture Generation Systems on Common Data: The GENEA Challenge 2020(IUI \u201921).  Taras Kucherenko Patrik Jonell Youngwoo Yoon Pieter Wolfert and Gustav\u00a0Eje Henter. 2021. 
A Large Crowdsourced Evaluation of Gesture Generation Systems on Common Data: The GENEA Challenge 2020(IUI \u201921).","DOI":"10.1145\/3397481.3450692"},{"key":"e_1_3_2_1_28_1","volume-title":"2019 IEEE\/CVF International Conference on Computer Vision (ICCV).","author":"Lee G.","unstructured":"G. Lee , Z. Deng , S. Ma , T. Shiratori , S. Srinivasa , and Y. Sheikh . 2019. Talking With Hands 16.2M: A Large-Scale Dataset of Synchronized Body-Finger Motion and Audio for Conversational Motion Analysis and Synthesis . In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). G. Lee, Z. Deng, S. Ma, T. Shiratori, S. Srinivasa, and Y. Sheikh. 2019. Talking With Hands 16.2M: A Large-Scale Dataset of Synchronized Body-Finger Motion and Audio for Conversational Motion Analysis and Synthesis. In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Jehee Lee Jinxiang Chai Paul S.\u00a0A. Reitsma Jessica\u00a0K. Hodgins and Nancy\u00a0S. Pollard. 2002. Interactive Control of Avatars Animated with Human Motion Data. ACM Trans. Graph. (2002).  Jehee Lee Jinxiang Chai Paul S.\u00a0A. Reitsma Jessica\u00a0K. Hodgins and Nancy\u00a0S. Pollard. 2002. Interactive Control of Avatars Animated with Human Motion Data. ACM Trans. Graph. (2002).","DOI":"10.1145\/566570.566607"},{"key":"e_1_3_2_1_30_1","volume-title":"Intelligent virtual agents","author":"Lee Jina","unstructured":"Jina Lee and Stacy Marsella . 2006. Nonverbal behavior generator for embodied conversational agents . In Intelligent virtual agents . Springer , 243\u2013255. Jina Lee and Stacy Marsella. 2006. Nonverbal behavior generator for embodied conversational agents. In Intelligent virtual agents. Springer, 243\u2013255."},{"key":"e_1_3_2_1_31_1","unstructured":"Yongjoon Lee Kevin Wampler Gilbert Bernstein Jovan Popovi\u0107 and Zoran Popovi\u0107. 2014. Motion Fields for Interactive Character Locomotion. ACM Trans. Graph. 
(2014).  Yongjoon Lee Kevin Wampler Gilbert Bernstein Jovan Popovi\u0107 and Zoran Popovi\u0107. 2014. Motion Fields for Interactive Character Locomotion. ACM Trans. Graph. (2014)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Sergey Levine Philipp Kr\u00e4henb\u00fchl Sebastian Thrun and Vladlen Koltun. 2010. Gesture controllers. In SIGGRAPH \u201910.  Sergey Levine Philipp Kr\u00e4henb\u00fchl Sebastian Thrun and Vladlen Koltun. 2010. Gesture controllers. In SIGGRAPH \u201910.","DOI":"10.1145\/1833349.1778861"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Sergey Levine Christian Theobalt and Vladlen Koltun. 2009. Real-Time Prosody-Driven Synthesis of Body Language. In ACM SIGGRAPH Asia 2009 Papers.  Sergey Levine Christian Theobalt and Vladlen Koltun. 2009. Real-Time Prosody-Driven Synthesis of Body Language. In ACM SIGGRAPH Asia 2009 Papers.","DOI":"10.1145\/1661412.1618518"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"S. Mariooryad and C. Busso. 2012. Generating Human-Like Behaviors Using Joint Speech-Driven Models for Conversational Agents. IEEE Transactions on Audio Speech and Language Processing (2012).  S. Mariooryad and C. Busso. 2012. Generating Human-Like Behaviors Using Joint Speech-Driven Models for Conversational Agents. IEEE Transactions on Audio Speech and Language Processing (2012).","DOI":"10.1109\/TASL.2012.2201476"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"David McNeill. 2000. Language and Gesture. Cambridge University Press.  David McNeill. 2000. Language and Gesture. Cambridge University Press.","DOI":"10.1017\/CBO9780511620850"},{"key":"e_1_3_2_1_37_1","unstructured":"Mehdi Mirza and Simon Osindero. 2014. Conditional Generative Adversarial Nets. CoRR abs\/1411.1784(2014).  Mehdi Mirza and Simon Osindero. 2014. Conditional Generative Adversarial Nets. 
CoRR abs\/1411.1784(2014)."},{"key":"e_1_3_2_1_38_1","unstructured":"Vinod Nair and Geoffrey\u00a0E. Hinton. 2010. Rectified Linear Units Improve Restricted Boltzmann Machines. In ICML.  Vinod Nair and Geoffrey\u00a0E. Hinton. 2010. Rectified Linear Units Improve Restricted Boltzmann Machines. In ICML."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2019.04.005"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Alla Safonova and Jessica\u00a0K. Hodgins. 2007. Construction and Optimal Search of Interpolated Motion Graphs. ACM Trans. Graph. (2007).  Alla Safonova and Jessica\u00a0K. Hodgins. 2007. Construction and Optimal Search of Interpolated Motion Graphs. ACM Trans. Graph. (2007).","DOI":"10.1145\/1275808.1276510"},{"key":"e_1_3_2_1_41_1","volume-title":"IEEE Computer Society Conference on Computer Vision and Pattern Recognition.","author":"Shlizerman Eli","year":"2018","unstructured":"Eli Shlizerman , Lucio Dery , Hayden Schoen , and Ira Kemelmacher-Shlizerman . 2018 . Audio to body dynamics. CVPR , IEEE Computer Society Conference on Computer Vision and Pattern Recognition. Eli Shlizerman, Lucio Dery, Hayden Schoen, and Ira Kemelmacher-Shlizerman. 2018. Audio to body dynamics. CVPR, IEEE Computer Society Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3072959.3073697","article-title":"Understanding the impact of animated gesture performance on personality perceptions","volume":"36","author":"Smith Harrison\u00a0Jesse","year":"2017","unstructured":"Harrison\u00a0Jesse Smith and Michael Neff . 2017 . Understanding the impact of animated gesture performance on personality perceptions . ACM Transactions on Graphics (TOG) 36 , 4 (2017), 1 \u2013 12 . Harrison\u00a0Jesse Smith and Michael Neff. 2017. Understanding the impact of animated gesture performance on personality perceptions. 
ACM Transactions on Graphics (TOG) 36, 4 (2017), 1\u201312.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Youngwoo Yoon Bok Cha Joo-Haeng Lee Minsu Jang Jaeyeon Lee Jaehong Kim and Geehyuk Lee. 2020. Speech Gesture Generation from the Trimodal Context of Text Audio and Speaker Identity. ACM Transactions on Graphics(2020).  Youngwoo Yoon Bok Cha Joo-Haeng Lee Minsu Jang Jaeyeon Lee Jaehong Kim and Geehyuk Lee. 2020. Speech Gesture Generation from the Trimodal Context of Text Audio and Speaker Identity. ACM Transactions on Graphics(2020).","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793720"},{"key":"e_1_3_2_1_45_1","volume-title":"SGToolkit: An Interactive Gesture Authoring Toolkit for Embodied Conversational Agents. In The 34th Annual ACM Symposium on User Interface Software and Technology(UIST \u201921)","author":"Yoon Youngwoo","year":"2021","unstructured":"Youngwoo Yoon , Keunwoo Park , Minsu Jang , Jaehong Kim , and Geehyuk Lee . 2021 . SGToolkit: An Interactive Gesture Authoring Toolkit for Embodied Conversational Agents. In The 34th Annual ACM Symposium on User Interface Software and Technology(UIST \u201921) . Association for Computing Machinery. Youngwoo Yoon, Keunwoo Park, Minsu Jang, Jaehong Kim, and Geehyuk Lee. 2021. SGToolkit: An Interactive Gesture Authoring Toolkit for Embodied Conversational Agents. In The 34th Annual ACM Symposium on User Interface Software and Technology(UIST \u201921). 
Association for Computing Machinery."}],"event":{"name":"SIGGRAPH '22: Special Interest Group on Computer Graphics and Interactive Techniques Conference","location":"Vancouver BC Canada","acronym":"SIGGRAPH '22","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Special Interest Group on Computer Graphics and Interactive Techniques Conference Proceedings"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3528233.3530750","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:42Z","timestamp":1750186962000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3528233.3530750"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,7]]},"references-count":45,"alternative-id":["10.1145\/3528233.3530750","10.1145\/3528233"],"URL":"https:\/\/doi.org\/10.1145\/3528233.3530750","relation":{},"subject":[],"published":{"date-parts":[[2022,8,7]]}}}