{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:42:51Z","timestamp":1777653771791,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,9,19]],"date-time":"2023-09-19T00:00:00Z","timestamp":1695081600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,9,19]]},"DOI":"10.1145\/3570945.3607337","type":"proceedings-article","created":{"date-parts":[[2023,12,22]],"date-time":"2023-12-22T06:07:02Z","timestamp":1703225222000},"page":"1-8","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Augmented Co-Speech Gesture Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-3646-7702","authenticated-orcid":false,"given":"Hendric","family":"Vo\u00df","sequence":"first","affiliation":[{"name":"Social Cognitive Systems Group, Bielefeld University, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4047-9277","authenticated-orcid":false,"given":"Stefan","family":"Kopp","sequence":"additional","affiliation":[{"name":"Social Cognitive Systems Group, Bielefeld University, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,12,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings, Part XVIII 16","author":"Ahuja Chaitanya","year":"2020","unstructured":"Chaitanya Ahuja, Dong Won Lee, Yukiko I Nakano, and Louis-Philippe Morency. 2020. Style transfer for co-speech gesture animation: A multi-speaker conditional-mixture approach. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XVIII 16. Springer, 248--265."},{"key":"e_1_3_2_1_2_1","volume-title":"Taras Kucherenko, and Jonas Beskow.","author":"Alexanderson Simon","year":"2020","unstructured":"Simon Alexanderson, Gustav Eje Henter, Taras Kucherenko, and Jonas Beskow. 2020. Style-Controllable Speech-Driven Gesture Synthesis Using Normalising Flows. In Computer Graphics Forum, Vol. 39. Wiley Online Library, 487--496."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555435"},{"key":"e_1_3_2_1_4_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33 (2020), 12449--12460."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-04380-2_12"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/344779.344865"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/383259.383315"},{"key":"e_1_3_2_1_8_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_9_1","volume-title":"AlphaPose: Whole-Body Regional Multi-Person Pose Estimation and Tracking in Real-Time","author":"Fang Hao-Shu","year":"2022","unstructured":"Hao-Shu Fang, Jiefeng Li, Hongyang Tang, Chao Xu, Haoyi Zhu, Yuliang Xiu, Yong-Lu Li, and Cewu Lu. 2022. AlphaPose: Whole-Body Regional Multi-Person Pose Estimation and Tracking in Real-Time. IEEE Transactions on Pattern Analysis and Machine Intelligence (2022)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267851.3267898"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3359566.3360053"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.socec.2010.10.008"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1006\/cviu.2000.0894"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3558068"},{"key":"e_1_3_2_1_15_1","volume-title":"Zeroeggs: Zero-shot example-based gesture generation from speech. arXiv preprint arXiv:2209.07556","author":"Ghorbani Saeed","year":"2022","unstructured":"Saeed Ghorbani, Ylva Ferstl, Daniel Holden, Nikolaus F Troje, and Marc-Andr\u00e9 Carbonneau. 2022. Zeroeggs: Zero-shot example-based gesture generation from speech. arXiv preprint arXiv:2209.07556 (2022)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00361"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530750"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417836"},{"key":"e_1_3_2_1_19_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/11821830_17"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472306.3478333"},{"key":"e_1_3_2_1_22_1","volume-title":"Dancing to music. Advances in neural information processing systems 32","author":"Lee Hsin-Ying","year":"2019","unstructured":"Hsin-Ying Lee, Xiaodong Yang, Ming-Yu Liu, Ting-Chun Wang, Yu-Ding Lu, Ming-Hsuan Yang, and Jan Kautz. 2019. Dancing to music. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01022"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392422"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"e_1_3_2_1_26_1","volume-title":"LREC 2010 workshop: Multimodal corpora--advances in capturing, coding and analyzing multimodality.","author":"L\u00fccking Andy","year":"2010","unstructured":"Andy L\u00fccking, Kirsten Bergmann, Florian Hahn, Stefan Kopp, and Hannes Rieser. 2010. The Bielefeld speech and gesture alignment corpus (SaGA). In LREC 2010 workshop: Multimodal corpora--advances in capturing, coding and analyzing multimodality."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/1330511.1330516"},{"key":"e_1_3_2_1_28_1","volume-title":"Gustav Eje Henter, and Michael Neff","author":"Nyatsanga Simbarashe","year":"2023","unstructured":"Simbarashe Nyatsanga, Taras Kucherenko, Chaitanya Ahuja, Gustav Eje Henter, and Michael Neff. 2023. A Comprehensive Review of Data-Driven Co-Speech Gesture Generation. arXiv preprint arXiv:2301.05339 (2023)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/MRA.2018.2833157"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00794"},{"key":"e_1_3_2_1_31_1","volume-title":"Aaron Van den Oord, and Oriol Vinyals","author":"Razavi Ali","year":"2019","unstructured":"Ali Razavi, Aaron Van den Oord, and Oriol Vinyals. 2019. Generating diverse high-fidelity images with vq-vae-2. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_32_1","volume-title":"Yolov3: An incremental improvement. arXiv preprint arXiv:1804.02767","author":"Redmon Joseph","year":"2018","unstructured":"Joseph Redmon and Ali Farhadi. 2018. Yolov3: An incremental improvement. arXiv preprint arXiv:1804.02767 (2018)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2018.03.006"},{"key":"e_1_3_2_1_34_1","unstructured":"TED. [n.d.]. TED --- youtube.com. https:\/\/www.youtube.com\/c\/TED\/videos. [Accessed 16-Feb-2023]."},{"key":"e_1_3_2_1_35_1","unstructured":"TEDx. [n.d.]. TEDx Talks --- youtube.com. https:\/\/www.youtube.com\/channel\/UCsT0YIqwnpJCM-mx7-gSA4Q. [Accessed 16-Feb-2023]."},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the Workshop on Balanced Perception and Action in ECAs at AAMAS","volume":"4","author":"Tepper Paul","year":"2004","unstructured":"Paul Tepper, Stefan Kopp, and Justine Cassell. 2004. Content in context: Generating language and iconic gesture without a gestionary. In Proceedings of the Workshop on Balanced Perception and Action in ECAs at AAMAS, Vol. 4. 8."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Hendric Vo\u00df and Stefan Kopp. 2023. AQ-GT: a Temporally Aligned and Quantized GRU-Transformer for Co-Speech Gesture Synthesis. arXiv:2305.01241 [cs.HC]","DOI":"10.1145\/3577190.3614135"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472749.3474789"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-35888-4_59"},{"key":"e_1_3_2_1_41_1","volume-title":"Mediapipe hands: On-device real-time hand tracking. arXiv preprint arXiv:2006.10214","author":"Zhang Fan","year":"2020","unstructured":"Fan Zhang, Valentin Bazarevsky, Andrey Vakunov, Andrei Tkachenka, George Sung, Chuo-Ling Chang, and Matthias Grundmann. 2020. Mediapipe hands: On-device real-time hand tracking. arXiv preprint arXiv:2006.10214 (2020)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00363"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Chi Zhou Tengyue Bian and Kang Chen. 2022. GestureMaster: Graph-based Speech-driven Gesture Generation. In INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION. ACM Bengaluru India 764--770. https:\/\/doi.org\/10\/gsd2t6","DOI":"10.1145\/3536221.3558063"}],"event":{"name":"IVA '23: ACM International Conference on Intelligent Virtual Agents","location":"W\u00fcrzburg Germany","acronym":"IVA '23","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence"]},"container-title":["Proceedings of the 23rd ACM International Conference on Intelligent Virtual Agents"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3570945.3607337","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3570945.3607337","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T14:26:49Z","timestamp":1755872809000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3570945.3607337"}},"subtitle":["Including Form and Meaning Features to Guide Learning-Based Gesture Synthesis"],"short-title":[],"issued":{"date-parts":[[2023,9,19]]},"references-count":43,"alternative-id":["10.1145\/3570945.3607337","10.1145\/3570945"],"URL":"https:\/\/doi.org\/10.1145\/3570945.3607337","relation":{},"subject":[],"published":{"date-parts":[[2023,9,19]]},"assertion":[{"value":"2023-12-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}