{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T05:21:23Z","timestamp":1779340883194,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":75,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the Ng Teng Fong Charitable Foundation in the form of ZJU-SUTD IDEA Grant","award":["188170-11102"],"award-info":[{"award-number":["188170-11102"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611705","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"3538-3549","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Cultural Self-Adaptive Multimodal Gesture Generation Based on Multiple Culture Gesture Dataset"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2586-330X","authenticated-orcid":false,"given":"Jingyu","family":"Wu","sequence":"first","affiliation":[{"name":"College of Computer Science and Technology, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3577-5725","authenticated-orcid":false,"given":"Shi","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang-Singapore Innovation and AI Joint Research Lab, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2030-3393","authenticated-orcid":false,"given":"Shuyu","family":"Gan","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6068-5185","authenticated-orcid":false,"given":"Weijun","family":"Li","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4925-5250","authenticated-orcid":false,"given":"Changyuan","family":"Yang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5561-0493","authenticated-orcid":false,"given":"Lingyun","family":"Sun","sequence":"additional","affiliation":[{"name":"Zhejiang-Singapore Innovation and AI Joint Research Lab, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Yukiko I Nakano, and Louis-Philippe Morency.","author":"Ahuja Chaitanya","year":"2020","unstructured":"Chaitanya Ahuja, Dong Won Lee, Yukiko I Nakano, and Louis-Philippe Morency. 2020. Style transfer for co-speech gesture animation: A multi-speaker conditional-mixture approach. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XVIII 16. Springer, 248--265."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2019.00084"},{"key":"e_1_3_2_1_3_1","volume-title":"Taras Kucherenko, and Jonas Beskow.","author":"Alexanderson Simon","year":"2020","unstructured":"Simon Alexanderson, Gustav Eje Henter, Taras Kucherenko, and Jonas Beskow. 2020a. Style-Controllable Speech-Driven Gesture Synthesis Using Normalising Flows. In Computer Graphics Forum, Vol. 39. Wiley Online Library, 487--496."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383652.3423874"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1002\/cav.1944"},{"key":"e_1_3_2_1_6_1","volume-title":"Gesture and the nature of language","author":"Armstrong David F","unstructured":"David F Armstrong, William C Stokoe, and Sherman E Wilcox. 1995. Gesture and the nature of language. Cambridge University Press."},{"key":"e_1_3_2_1_7_1","volume-title":"An empirical evaluation of generic convolutional and recurrent networks for sequence modeling. arXiv preprint arXiv:1803.01271","author":"Bai Shaojie","year":"2018","unstructured":"Shaojie Bai, J Zico Kolter, and Vladlen Koltun. 2018. An empirical evaluation of generic convolutional and recurrent networks for sequence modeling. arXiv preprint arXiv:1803.01271 (2018)."},{"key":"e_1_3_2_1_8_1","volume-title":"Text2gestures: A transformer-based network for generating emotive body gestures for virtual agents. In 2021 IEEE virtual reality and 3D user interfaces (VR)","author":"Bhattacharya Uttaran","unstructured":"Uttaran Bhattacharya, Nicholas Rewkowski, Abhishek Banerjee, Pooja Guhan, Aniket Bera, and Dinesh Manocha. 2021. Text2gestures: A transformer-based network for generating emotive body gestures for virtual agents. In 2021 IEEE virtual reality and 3D user interfaces (VR). IEEE, 1--10."},{"key":"e_1_3_2_1_9_1","volume-title":"Does language shape thought?: Mandarin and English speakers' conceptions of time. Cognitive psychology","author":"Boroditsky Lera","year":"2001","unstructured":"Lera Boroditsky. 2001. Does language shape thought?: Mandarin and English speakers' conceptions of time. Cognitive psychology, Vol. 43, 1 (2001), 1--22."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/Humanoids.2011.6100810"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2015.2493525"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.143"},{"key":"e_1_3_2_1_13_1","volume-title":"Speech-gesture mismatches: Evidence for one underlying representation of linguistic and nonlinguistic information. Pragmatics & cognition","author":"Cassell Justine","year":"1999","unstructured":"Justine Cassell, David McNeill, and Karl-Erik McCullough. 1999. Speech-gesture mismatches: Evidence for one underlying representation of linguistic and nonlinguistic information. Pragmatics & cognition, Vol. 7, 1 (1999), 1--34."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00093"},{"key":"e_1_3_2_1_15_1","article-title":"Nonverbal Behaviour as Communication: The Arabian Coffee Making Ritual","volume":"5","author":"Darweesh Lena","year":"2010","unstructured":"Lena Darweesh. 2010. Nonverbal Behaviour as Communication: The Arabian Coffee Making Ritual. International Journal of Interdisciplinary Social Sciences, Vol. 5, 7 (2010).","journal-title":"International Journal of Interdisciplinary Social Sciences"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-021-09534-8"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267851.3267898"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2020.04.007"},{"key":"e_1_3_2_1_19_1","volume-title":"Nataliya Berbyuk Lindstr\u00f6m, and Pierre Gander","author":"Gander Anna Jia","year":"2021","unstructured":"Anna Jia Gander, Nataliya Berbyuk Lindstr\u00f6m, and Pierre Gander. 2021. Expressing Agreement in Swedish and Chinese: A Case Study of Communicative Feedback in First-Time Encounters. In Cross-Cultural Design. Experience and Product Design Across Cultures: 13th International Conference, CCD 2021, Held as Part of the 23rd HCI International Conference, HCII 2021, Virtual Event, July 24--29, 2021, Proceedings, Part I 23. Springer, 390--407."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.265"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00361"},{"key":"e_1_3_2_1_22_1","volume-title":"Learning word vectors for 157 languages. arXiv preprint arXiv:1802.06893","author":"Grave Edouard","year":"2018","unstructured":"Edouard Grave, Piotr Bojanowski, Prakhar Gupta, Armand Joulin, and Tomas Mikolov. 2018. Learning word vectors for 157 languages. arXiv preprint arXiv:1802.06893 (2018)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472306.3478335"},{"key":"e_1_3_2_1_24_1","unstructured":"Edward T Hall. 1976. Beyond culture. Anchor."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417836"},{"key":"e_1_3_2_1_27_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_28_1","volume-title":"Long short-term memory. Neural computation","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation, Vol. 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4612-4380-9_35"},{"key":"e_1_3_2_1_30_1","volume-title":"On the \"steerability\" of generative adversarial networks. arXiv preprint arXiv:1907.07171","author":"Jahanian Ali","year":"2019","unstructured":"Ali Jahanian, Lucy Chai, and Phillip Isola. 2019. On the \"steerability\" of generative adversarial networks. arXiv preprint arXiv:1907.07171 (2019)."},{"key":"e_1_3_2_1_31_1","volume-title":"zip: Compressing text classification models. arXiv preprint arXiv:1612.03651","author":"Joulin Armand","year":"2016","unstructured":"Armand Joulin, Edouard Grave, Piotr Bojanowski, Matthijs Douze, H\u00e9rve J\u00e9gou, and Tomas Mikolov. 2016b. Fasttext. zip: Compressing text classification models. arXiv preprint arXiv:1612.03651 (2016)."},{"key":"e_1_3_2_1_32_1","volume-title":"Bag of tricks for efficient text classification. arXiv preprint arXiv:1607.01759","author":"Joulin Armand","year":"2016","unstructured":"Armand Joulin, Edouard Grave, Piotr Bojanowski, and Tomas Mikolov. 2016a. Bag of tricks for efficient text classification. arXiv preprint arXiv:1607.01759 (2016)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.33166\/AETiC.2021.05.006"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511807572"},{"key":"e_1_3_2_1_35_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_36_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1080\/01690960802586188"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1353\/sls.2012.0027"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 763--772","author":"Lee Gilwoo","year":"2019","unstructured":"Gilwoo Lee, Zhiwei Deng, Shugao Ma, Takaaki Shiratori, Siddhartha S Srinivasa, and Yaser Sheikh. 2019. Talking with hands 16.2 m: A large-scale dataset of synchronized body-finger motion and audio for conversational motion analysis and synthesis. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 763--772."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"e_1_3_2_1_42_1","volume-title":"Tel Aviv","author":"Liu Haiyang","year":"2022","unstructured":"Haiyang Liu, Zihao Zhu, Naoya Iwamoto, Yichen Peng, Zhengqing Li, You Zhou, Elif Bozkurt, and Bo Zheng. 2022b. BEAT: A Large-Scale Semantic and Emotional Multi-Modal Dataset for Conversational Gestures Synthesis. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part VII. Springer, 612--630."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2017.2736999"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414660"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3558059"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the 18th Nordic Conference of Computational Linguistics (NODALIDA","author":"Navarretta Costanza","year":"2011","unstructured":"Costanza Navarretta, Elisabeth Ahls\u00e9n, Jens Allwood, Kristiina Jokinen, and Patrizia Paggio. 2011. Creating comparable multimodal corpora for nordic languages. In Proceedings of the 18th Nordic Conference of Computational Linguistics (NODALIDA 2011). 153--160."},{"key":"e_1_3_2_1_49_1","volume-title":"With the future behind them: Convergent evidence from Aymara language and gesture in the crosslinguistic comparison of spatial construals of time. Cognitive science","author":"N\u00fanez Rafael E","year":"2006","unstructured":"Rafael E N\u00fanez and Eve Sweetser. 2006. With the future behind them: Convergent evidence from Aymara language and gesture in the crosslinguistic comparison of spatial construals of time. Cognitive science, Vol. 30, 3 (2006), 401--450."},{"key":"e_1_3_2_1_50_1","volume-title":"Feedback and gestural behaviour in a conversational corpus of Danish","author":"Paggio Patrizia","year":"2011","unstructured":"Patrizia Paggio and Costanza Navarretta. 2011. Feedback and gestural behaviour in a conversational corpus of Danish. NEALT (Northen European Association of Language Technology) Proceedings Series (2011), 33--39."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00794"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC'16)","author":"Popescu Vladimir","year":"2016","unstructured":"Vladimir Popescu, Lin Liu, Riccardo Del Gratta, Khalid Choukri, and Nicoletta Calzolari. 2016. New developments in the LRE map. In Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC'16). 4526--4530."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01089"},{"key":"e_1_3_2_1_54_1","volume-title":"International conference on machine learning. PMLR, 1278--1286","author":"Rezende Danilo Jimenez","year":"2014","unstructured":"Danilo Jimenez Rezende, Shakir Mohamed, and Daan Wierstra. 2014. Stochastic backpropagation and approximate inference in deep generative models. In International conference on machine learning. PMLR, 1278--1286."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"David E Rumelhart Geoffrey E Hinton and Ronald J Williams. 1985. Learning internal representations by error propagation. Technical Report. California Univ San Diego La Jolla Inst for Cognitive Science.","DOI":"10.21236\/ADA164453"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_43"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1177\/002383099403700208"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3125739.3132594"},{"key":"e_1_3_2_1_59_1","unstructured":"Emmi Toivio and Kristiina Jokinen. 2012. Multimodal Feedback Signaling in Finnish.. In Baltic HLT. 247--255."},{"key":"e_1_3_2_1_60_1","first-page":"7","article-title":"How are gestures used by politicians? A multimodal co-gesture analysis","volume":"7","author":"Trotta Daniela","year":"2021","unstructured":"Daniela Trotta and Raffaele Guarasci. 2021. How are gestures used by politicians? A multimodal co-gesture analysis. IJCoL. Italian Journal of Computational Linguistics, Vol. 7, 7--1, 2 (2021), 45--66.","journal-title":"IJCoL. Italian Journal of Computational Linguistics"},{"key":"e_1_3_2_1_61_1","unstructured":"Aaron Van Den Oord Oriol Vinyals et al. 2017. Neural discrete representation learning. Advances in neural information processing systems Vol. 30 (2017)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3382507.3418872"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0113647"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2013.09.008"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-70022-9_33"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.3390\/electronics10030228"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3461615.3485407"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1080\/10447318.2023.2228530"},{"key":"e_1_3_2_1_69_1","volume-title":"Preserving Structural Consistency in Arbitrary Artist and Artwork Style Transfer. In Proceedings of the AAAI Conference on Artificial Intelligence","volume":"37","author":"Wu Jingyu","year":"2023","unstructured":"Jingyu Wu, Lefan Hou, Zejian Li, Jun Liao, Li Liu, and Lingyun Sun. 2023 b. Preserving Structural Consistency in Arbitrary Artist and Artwork Style Transfer. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 37. 2830--2838."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1083-6101.2006.tb00313.x"},{"key":"e_1_3_2_1_71_1","volume-title":"Videogpt: Video generation using vq-vae and transformers. arXiv preprint arXiv:2104.10157","author":"Yan Wilson","year":"2021","unstructured":"Wilson Yan, Yunzhi Zhang, Pieter Abbeel, and Aravind Srinivas. 2021. Videogpt: Video generation using vq-vae and transformers. arXiv preprint arXiv:2104.10157 (2021)."},{"key":"e_1_3_2_1_72_1","volume-title":"Articulated human detection with flexible mixtures of parts","author":"Yang Yi","year":"2012","unstructured":"Yi Yang and Deva Ramanan. 2012. Articulated human detection with flexible mixtures of parts. IEEE transactions on pattern analysis and machine intelligence, Vol. 35, 12 (2012), 2878--2890."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793720"},{"key":"e_1_3_2_1_75_1","unstructured":"Yu Zhang Wei Han James Qin Yongqiang Wang Ankur Bapna Zhehuai Chen Nanxin Chen Bo Li Vera Axelrod Gary Wang et al. 2023. Google usm: Scaling automatic speech recognition beyond 100 languages. arXiv preprint arXiv:2303.01037 (2023)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611705","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611705","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:06:30Z","timestamp":1755821190000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611705"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":75,"alternative-id":["10.1145\/3581783.3611705","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611705","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}