{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,16]],"date-time":"2025-10-16T00:54:41Z","timestamp":1760576081152,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T00:00:00Z","timestamp":1730678400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,4]]},"DOI":"10.1145\/3686215.3688819","type":"proceedings-article","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T12:17:01Z","timestamp":1730290621000},"page":"138-146","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Towards interpretable co-speech gestures synthesis using STARGATE"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-6799-8008","authenticated-orcid":false,"given":"Louis","family":"Abel","sequence":"first","affiliation":[{"name":"Universit\u00e9 de Lorraine, CNRS, Inria, LORIA, F-54000, France"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5040-6971","authenticated-orcid":false,"given":"Vincent","family":"Colotte","sequence":"additional","affiliation":[{"name":"Universit\u00e9 de Lorraine, CNRS, Inria, LORIA, F-54000, France"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5286-7368","authenticated-orcid":false,"given":"Slim","family":"Ouni","sequence":"additional","affiliation":[{"name":"Universit\u00e9 de Lorraine, CNRS, Inria, LORIA, F-54000, France"}]}],"member":"320","published-online":{"date-parts":[[2024,11,4]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-302"},{"volume-title":"Computer Graphics Forum, Vol.\u00a039","author":"Alexanderson Simon","key":"e_1_3_2_2_2_1","unstructured":"Simon Alexanderson, Gustav\u00a0Eje Henter, Taras Kucherenko, and Jonas Beskow. 2020. Style-Controllable Speech-Driven Gesture Synthesis Using Normalising Flows. In Computer Graphics Forum, Vol.\u00a039. Wiley Online Library, 487\u2013496."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592458"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555435"},{"key":"e_1_3_2_2_5_1","volume-title":"Une morphologie de la gestualit\u00e9 : structuration articulaire. Cahiers de linguistique analogique5 (Dec","author":"Boutet Dominique","year":"2008","unstructured":"Dominique Boutet. 2008. Une morphologie de la gestualit\u00e9 : structuration articulaire. Cahiers de linguistique analogique5 (Dec. 2008), 81\u2013115. https:\/\/hal.science\/hal-00607593"},{"key":"e_1_3_2_2_6_1","volume-title":"Embodied conversational agents: representation and intelligence in user interfaces. AI magazine 22, 4","author":"Cassell Justine","year":"2001","unstructured":"Justine Cassell. 2001. Embodied conversational agents: representation and intelligence in user interfaces. AI magazine 22, 4 (2001), 67\u201367."},{"key":"e_1_3_2_2_7_1","unstructured":"Kyunghyun Cho Bart Merrienboer Caglar Gulcehre Fethi Bougares Holger Schwenk and Yoshua Bengio. 2014. Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation. In EMNLP."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1756-8765.2012.01183.x"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577190.3616117"},{"key":"e_1_3_2_2_10_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_11_1","volume-title":"Workshop on Multimodal Corpora, Vol.\u00a06. 86\u201391","author":"Ferr\u00e9 Ga\u00eblle","year":"2010","unstructured":"Ga\u00eblle Ferr\u00e9. 2010. Timing relationships between speech and co-verbal gestures in spontaneous French. In Language Resources and Evaluation, Workshop on Multimodal Corpora, Vol.\u00a06. 86\u201391."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1080\/10867651.1998.10487493"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417836"},{"key":"e_1_3_2_2_14_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)."},{"volume-title":"Breakthroughs in statistics: Methodology and distribution","author":"Huber J","key":"e_1_3_2_2_15_1","unstructured":"Peter\u00a0J Huber. 1992. Robust estimation of a location parameter. In Breakthroughs in statistics: Methodology and distribution. Springer, 492\u2013518."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1075\/gest.00003.jen"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511807572"},{"key":"e_1_3_2_2_18_1","volume-title":"Semi-Supervised Classification with Graph Convolutional Networks. arXiv preprint arXiv:1609.02907","author":"Kipf N","year":"2016","unstructured":"Thomas\u00a0N Kipf and Max Welling. 2016. Semi-Supervised Classification with Graph Convolutional Networks. arXiv preprint arXiv:1609.02907 (2016)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1093\/acprof:oso\/9780198524519.003.0006"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308532.3329472"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3382507.3418815"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397481.3450692"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_36"},{"key":"e_1_3_2_2_25_1","volume-title":"Co-Speech Gesture Synthesis using Discrete Gesture Token Learning. arXiv preprint arXiv:2303.12822","author":"Lu Shuhong","year":"2023","unstructured":"Shuhong Lu, Youngwoo Yoon, and Andrew Feng. 2023. Co-Speech Gesture Synthesis using Discrete Gesture Token Learning. arXiv preprint arXiv:2303.12822 (2023)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"crossref","unstructured":"Michael McAuliffe Michaela Socolof Sarah Mihuc Michael Wagner and Morgan Sonderegger. 2017. Montreal forced aligner: Trainable text-speech alignment using kaldi.. In Interspeech Vol.\u00a02017. 498\u2013502.","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"e_1_3_2_2_27_1","volume-title":"Hearing lips and seeing voices. Nature 264, 5588","author":"McGurk Harry","year":"1976","unstructured":"Harry McGurk and John MacDonald. 1976. Hearing lips and seeing voices. Nature 264, 5588 (1976), 746\u2013748."},{"volume-title":"Hand and Mind: What Gestures Reveal about Thought","author":"McNeill David","key":"e_1_3_2_2_28_1","unstructured":"David McNeill. 1992. Hand and Mind: What Gestures Reveal about Thought. The University of Chicago Press, Chicago and London."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1016\/S1389-0417(02)00056-6"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1551-6709.2008.01006.x"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-58750-9_28"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-27077-2_18"},{"key":"e_1_3_2_2_34_1","volume-title":"DiffuGesture: Generating Human Gesture From Two-person Dialogue With Diffusion Models. In Companion Publication of the 25th International Conference on Multimodal Interaction. 179\u2013185","author":"Zhao Weiyu","year":"2023","unstructured":"Weiyu Zhao, Liangxiao Hu, and Shengping Zhang. 2023. DiffuGesture: Generating Human Gesture From Two-person Dialogue With Diffusion Models. In Companion Publication of the 25th International Conference on Multimodal Interaction. 179\u2013185."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR52148.2021.00018"}],"event":{"name":"ICMI '24: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"],"location":"San Jose Costa Rica","acronym":"ICMI '24"},"container-title":["Companion Proceedings of the 26th International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3686215.3688819","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3686215.3688819","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,15]],"date-time":"2025-10-15T16:22:22Z","timestamp":1760545342000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3686215.3688819"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,4]]},"references-count":35,"alternative-id":["10.1145\/3686215.3688819","10.1145\/3686215"],"URL":"https:\/\/doi.org\/10.1145\/3686215.3688819","relation":{},"subject":[],"published":{"date-parts":[[2024,11,4]]},"assertion":[{"value":"2024-11-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}