{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,13]],"date-time":"2026-06-13T07:23:42Z","timestamp":1781335422735,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":119,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T00:00:00Z","timestamp":1751587200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["1839971"],"award-info":[{"award-number":["1839971"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,5]]},"DOI":"10.1145\/3715336.3735769","type":"proceedings-article","created":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T10:09:55Z","timestamp":1751623795000},"page":"59-80","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":23,"title":["GesPrompt: Leveraging Co-Speech Gestures to Augment LLM-Based Interaction in Virtual Reality"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9497-6925","authenticated-orcid":false,"given":"Xiyun","family":"Hu","sequence":"first","affiliation":[{"name":"School of Mechanical Engineering, Purdue University, West Lafayette, Indiana, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1711-4930","authenticated-orcid":false,"given":"Dizhi","family":"Ma","sequence":"additional","affiliation":[{"name":"Elmore Family School of Electrical and Computer Engineering, Purdue University, West Lafayette, Indiana, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8558-8848","authenticated-orcid":false,"given":"Fengming","family":"He","sequence":"additional","affiliation":[{"name":"Elmore Family School of Electrical and Computer Engineering, Purdue University, West Lafayette, Indiana, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9935-0518","authenticated-orcid":false,"given":"Zhengzhe","family":"Zhu","sequence":"additional","affiliation":[{"name":"Elmore Family School of Electrical and Computer Engineering, Purdue University, West Lafayette, Indiana, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7893-5613","authenticated-orcid":false,"given":"Shao-Kang","family":"Hsia","sequence":"additional","affiliation":[{"name":"School of Mechanical Engineering, Purdue University, West Lafayette, Indiana, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3408-2876","authenticated-orcid":false,"given":"Chenfei","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of Mechanical Engineering, Purdue University, West Lafayette, Indiana, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1270-2734","authenticated-orcid":false,"given":"Ziyi","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Mechanical Engineering, Purdue University, West Lafayette, Indiana, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8639-5135","authenticated-orcid":false,"given":"Karthik","family":"Ramani","sequence":"additional","affiliation":[{"name":"School of Mechanical Engineering, Purdue University, West Lafayette, Indiana, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,7,4]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2024. Hololens 2. https:\/\/www.microsoft.com\/en-us\/hololens."},{"key":"e_1_3_3_2_3_2","unstructured":"2024. Leap Motion. https:\/\/www.ultraleap.com\/."},{"key":"e_1_3_3_2_4_2","unstructured":"2024. Meta Quest Pro. https:\/\/www.meta.com\/quest\/quest-pro\/."},{"key":"e_1_3_3_2_5_2","unstructured":"2024. Microsoft Azure Speech to Text. https:\/\/azure.microsoft.com\/en-us\/products\/ai-services\/ai-speech."},{"key":"e_1_3_3_2_6_2","unstructured":"2024. Microsoft Planner. https:\/\/tasks.office.com\/. Accessed: 2024-09-11."},{"key":"e_1_3_3_2_7_2","unstructured":"2024. Unity Engine. https:\/\/unity.com\/."},{"key":"e_1_3_3_2_8_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3643834.3661547"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586182.3616693"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Muneer Al-Hammadi Ghulam Muhammad Wadood Abdul Mansour Alsulaiman Mohammed\u00a0A Bencherif Tareq\u00a0S Alrayes Hassan Mathkour and Mohamed\u00a0Amine Mekhtiche. 2020. Deep learning-based approach for sign language gesture recognition with efficient hand gesture representation. Ieee Access 8 (2020) 192527\u2013192542.","DOI":"10.1109\/ACCESS.2020.3032140"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Martha\u00a0W Alibali. 2005. Gesture in spatial cognition: Expressing communicating and thinking about spatial information. Spatial cognition and computation 5 4 (2005) 307\u2013331.","DOI":"10.1207\/s15427633scc0504_2"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Tenglong Ao Qingzhe Gao Yuke Lou Baoquan Chen and Libin Liu. 2022. Rhythmic gesticulator: Rhythm-aware co-speech gesture synthesis with hierarchical neural embeddings. ACM Transactions on Graphics (TOG) 41 6 (2022) 1\u201319.","DOI":"10.1145\/3550454.3555435"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3332165.3347942"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/261135.261163"},{"key":"e_1_3_3_2_16_2","first-page":"262","volume-title":"Proceedings of the 7th annual conference on Computer graphics and interactive techniques","author":"Bolt Richard\u00a0A","year":"1980","unstructured":"Richard\u00a0A Bolt. 1980. \u201cPut-that-there\u201d Voice and gesture at the graphics interface. In Proceedings of the 7th annual conference on Computer graphics and interactive techniques. 262\u2013270."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3640794.3665563"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"crossref","unstructured":"Sara\u00a0C Broaders and Susan Goldin-Meadow. 2010. Truth is at hand: How gesture adds information during investigative interviews. Psychological Science 21 5 (2010) 623\u2013628.","DOI":"10.1177\/0956797610366082"},{"key":"e_1_3_3_2_19_2","unstructured":"Tom\u00a0B Brown. 2020. Language models are few-shot learners. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2005.14165 (2020)."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/988834.988871"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Judee\u00a0K Burgoon Thomas Birk and Michael Pfau. 1990. Nonverbal behaviors persuasion and credibility. Human communication research 17 1 (1990) 140\u2013169.","DOI":"10.1111\/j.1468-2958.1990.tb00229.x"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3242587.3242631"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/2380116.2380171"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00468"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/SMC.2017.8122603"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/AIxVR59861.2024.00011"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"crossref","unstructured":"Sharice Clough and Melissa\u00a0C Duff. 2020. The role of gesture in communication and cognition: Implications for understanding and treating neurogenic communication disorders. Frontiers in Human Neuroscience 14 (2020) 323.","DOI":"10.3389\/fnhum.2020.00323"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"H\u00e9l\u00e8ne Cochet and Jacques Vauclair. 2014. Deictic gestures and symbolic gestures produced by adults in an experimental context: Hand shapes and hand preferences. Laterality: Asymmetries of Body Brain and Cognition 19 3 (2014) 278\u2013301.","DOI":"10.1080\/1357650X.2013.804079"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642579"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Holger Diessel and Kenny\u00a0R Coventry. 2020. Demonstratives in spatial language and social interaction: An interdisciplinary review. Frontiers in Psychology 11 (2020) 555265.","DOI":"10.3389\/fpsyg.2020.555265"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","unstructured":"Mustafa\u00a0Doga Dogan Eric\u00a0J. Gonzalez Karan Ahuja Ruofei Du Andrea Cola\u00e7o Johnny Lee Mar Gonzalez-Franco and David Kim. 2024. Augmented Object Intelligence with XR-Objects. 10.1145\/3654777.3676379 arxiv:https:\/\/arXiv.org\/abs\/2404.13274\u00a0[cs.HC]","DOI":"10.1145\/3654777.3676379"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Runlin Duan Xiyun Hu Min Liu Jingyu Shi and Karthik Ramani. 2025. pARametric: Empowering In Situ Parametric Modeling in Augment Reality for Personal Fabrication. Journal of Computing and Information Science in Engineering 25 4 (2025) 041001.","DOI":"10.1115\/1.4067704"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545651"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.5220\/0010170501190127"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"crossref","unstructured":"Ad\u00e9la\u00efde Genay Anatole L\u00e9cuyer and Martin Hachet. 2021. Being an avatar \u201cfor real\u201d: a survey on virtual embodiment in augmented reality. IEEE Transactions on Visualization and Computer Graphics 28 12 (2021) 5071\u20135090.","DOI":"10.1109\/TVCG.2021.3099290"},{"key":"e_1_3_3_2_36_2","unstructured":"Esam Ghaleb Ilya Burenko Marlou Rasenberg Wim Pouw Ivan Toni Peter Uhrig Anna Wilson Judith Holler Asl\u0131 \u00d6zy\u00fcrek and Raquel Fern\u00e1ndez. 2024. Leveraging Speech for Gesture Detection in Multimodal Communication. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.14952 (2024)."},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00396"},{"key":"e_1_3_3_2_38_2","unstructured":"GitHub. 2023. GitHub Copilot. https:\/\/github.com\/features\/copilot Accessed: 2024-09-11."},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/VR58804.2024.00078"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"crossref","unstructured":"Lin Guo Zongxing Lu and Ligang Yao. 2021. Human-machine interaction sensing technology based on hand gesture recognition: A review. IEEE Transactions on Human-Machine Systems 51 4 (2021) 300\u2013309.","DOI":"10.1109\/THMS.2021.3086003"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"crossref","unstructured":"Fengming He Xiyun Hu Xun Qian Zhengzhe Zhu and Karthik Ramani. 2024. AdapTUI: Adaptation of Geometric-Feature-Based Tangible User Interfaces in Augmented Reality. Proceedings of the ACM on Human-Computer Interaction 8 ISS (2024) 44\u201369.","DOI":"10.1145\/3698127"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580704"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Henning Holle Thomas\u00a0C Gunter Shirley-Ann R\u00fcschemeyer Andreas Hennenlotter and Marco Iacoboni. 2008. Neural correlates of the processing of co-speech gestures. NeuroImage 39 4 (2008) 2010\u20132024.","DOI":"10.1016\/j.neuroimage.2007.10.055"},{"key":"e_1_3_3_2_44_2","unstructured":"Yining Hong Haoyu Zhen Peihao Chen Shuhong Zheng Yilun Du Zhenfang Chen and Chuang Gan. 2023. 3d-llm: Injecting the 3d world into large language models. Advances in Neural Information Processing Systems 36 (2023) 20482\u201320494."},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806266"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00094"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606793"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"crossref","unstructured":"Spencer\u00a0D Kelly and Quang-Anh Ngo\u00a0Tran. 2023. Exploring the Emotional Functions of Co-Speech Hand Gesture in Language and Communication. Topics in Cognitive Science (2023).","DOI":"10.1111\/tops.12657"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501931"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/2984511.2984567"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"crossref","unstructured":"Sotaro Kita and Asli \u00d6zy\u00fcrek. 2003. What does cross-linguistic variation in semantic coordination of speech and gesture reveal?: Evidence for an interface representation of spatial thinking and speaking. Journal of Memory and language 48 1 (2003) 16\u201332.","DOI":"10.1016\/S0749-596X(02)00505-3"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1007\/BFb0052986"},{"key":"e_1_3_3_2_53_2","unstructured":"Mikhail Konenkov Artem Lykov Daria Trinitatova and Dzmitry Tsetserukou. 2024. Vr-gpt: Visual language model for intelligent virtual reality applications. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.11537 (2024)."},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2019.8756576"},{"key":"e_1_3_3_2_55_2","unstructured":"Ryutaro Kurai Takefumi Hiraki Yuichi Hiroi Yutaro Hirao Monica Perusquia-Hernandez Hideaki Uchiyama and Kiyoshi Kiyokawa. 2024. MagicItem: Dynamic Behavior Design of Virtual Objects with Large Language Models in a Consumer Metaverse Platform. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.13242 (2024)."},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173610"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642230"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"crossref","unstructured":"Minkyung Lee Mark Billinghurst Woonhyuk Baek Richard Green and Woontack Woo. 2013. A usability study of multimodal input in an augmented reality environment. Virtual Reality 17 (2013) 293\u2013305.","DOI":"10.1007\/s10055-013-0230-0"},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","DOI":"10.1109\/IVCNZ.2008.4762125"},{"key":"e_1_3_3_2_60_2","unstructured":"Bo Li Yuanhan Zhang Dong Guo Renrui Zhang Feng Li Hao Zhang Kaichen Zhang Peiyuan Zhang Yanwei Li Ziwei Liu et\u00a0al. 2024. Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.03326 (2024)."},{"key":"e_1_3_3_2_61_2","unstructured":"Fangfu Liu Hanyang Wang Weiliang Chen Haowen Sun and Yueqi Duan. 2024. Make-Your-3D: Fast and Consistent Subject-Driven 3D Content Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.09625 (2024)."},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642947"},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/2207676.2208693"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676400"},{"key":"e_1_3_3_2_65_2","unstructured":"Dr\u00a0M Madhiarasan Prof Roy and Partha Pratim. 2022. A comprehensive review of sign language recognition: Different types modalities and datasets. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.03328 (2022)."},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"crossref","unstructured":"Daniel Martin Sandra Malpica Diego Gutierrez Belen Masia and Ana Serrano. 2022. Multimodality in VR: A survey. ACM Computing Surveys (CSUR) 54 10s (2022) 1\u201336.","DOI":"10.1145\/3508361"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"crossref","unstructured":"Ingrid Masson-Carro Martijn Goudbeek and Emiel Krahmer. 2017. How what we see and what we know influence iconic gesture production. Journal of nonverbal behavior 41 (2017) 367\u2013394.","DOI":"10.1007\/s10919-017-0261-4"},{"key":"e_1_3_3_2_68_2","doi-asserted-by":"publisher","DOI":"10.1109\/3DUI.2014.6798833"},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"publisher","DOI":"10.1145\/2993369.2993396"},{"key":"e_1_3_3_2_70_2","unstructured":"Meta XR SDK 2024. https:\/\/developer.oculus.com\/downloads\/package\/meta-xr-sdk-all-in-one-upm\/."},{"key":"e_1_3_3_2_71_2","unstructured":"Microsoft. 2024. Mixed Reality Toolkit (MRTK). https:\/\/github.com\/microsoft\/MixedRealityToolkit-Unity. Accessed: 2024-12-11."},{"key":"e_1_3_3_2_72_2","doi-asserted-by":"publisher","DOI":"10.1117\/12.197321"},{"key":"e_1_3_3_2_73_2","doi-asserted-by":"crossref","unstructured":"Abdullah Mujahid Mazhar\u00a0Javed Awan Awais Yasin Mazin\u00a0Abed Mohammed Robertas Dama\u0161evi\u010dius Rytis Maskeli\u016bnas and Karrar\u00a0Hameed Abdulkareem. 2021. Real-time hand gesture recognition based on deep learning YOLOv3 model. Applied Sciences 11 9 (2021) 4164.","DOI":"10.3390\/app11094164"},{"key":"e_1_3_3_2_74_2","doi-asserted-by":"publisher","DOI":"10.1145\/293701.293708"},{"key":"e_1_3_3_2_75_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642440"},{"key":"e_1_3_3_2_76_2","doi-asserted-by":"crossref","unstructured":"SK Ong and ZB Wang. 2011. Augmented assembly technologies based on 3D bare-hand interaction. CIRP annals 60 1 (2011) 1\u20134.","DOI":"10.1016\/j.cirp.2011.03.001"},{"key":"e_1_3_3_2_77_2","unstructured":"OpenAI. 2023. ChatGPT: GPT-4 Large Language Model. https:\/\/chat.openai.com\/ Accessed: 2024-09-11."},{"key":"e_1_3_3_2_78_2","unstructured":"OpenAI GPT-4o 2024. =https:\/\/openai.com\/index\/hello-gpt-4o\/."},{"key":"e_1_3_3_2_79_2","doi-asserted-by":"publisher","DOI":"10.1145\/3643834.3661588"},{"key":"e_1_3_3_2_80_2","doi-asserted-by":"crossref","unstructured":"Raedy Ping and Susan Goldin-Meadow. 2010. Gesturing saves cognitive resources when talking about nonpresent objects. Cognitive Science 34 4 (2010) 602\u2013619.","DOI":"10.1111\/j.1551-6709.2010.01102.x"},{"key":"e_1_3_3_2_81_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR.2014.6948411"},{"key":"e_1_3_3_2_82_2","doi-asserted-by":"publisher","DOI":"10.1109\/3DUI.2017.7893315"},{"key":"e_1_3_3_2_83_2","unstructured":"Ben Poole Ajay Jain Jonathan\u00a0T Barron and Ben Mildenhall. 2022. Dreamfusion: Text-to-3d using 2d diffusion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2209.14988 (2022)."},{"key":"e_1_3_3_2_84_2","doi-asserted-by":"crossref","unstructured":"Sara Price Carey Jewitt and Nikoleta Yiannoutsou. 2021. Conceptualising touch in VR. Virtual Reality 25 3 (2021) 863\u2013877.","DOI":"10.1007\/s10055-020-00494-y"},{"key":"e_1_3_3_2_85_2","doi-asserted-by":"crossref","unstructured":"Wen Qi Salih\u00a0Ertug Ovur Zhijun Li Aldo Marzullo and Rong Song. 2021. Multi-sensor guided hand gesture recognition for a teleoperated robot using a recurrent neural network. IEEE Robotics and Automation Letters 6 3 (2021) 6039\u20136045.","DOI":"10.1109\/LRA.2021.3089999"},{"key":"e_1_3_3_2_86_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517665"},{"key":"e_1_3_3_2_87_2","doi-asserted-by":"crossref","unstructured":"Siddharth\u00a0S Rautaray and Anupam Agrawal. 2015. Vision based hand gesture recognition for human computer interaction: a survey. Artificial intelligence review 43 (2015) 1\u201354.","DOI":"10.1007\/s10462-012-9356-9"},{"key":"e_1_3_3_2_88_2","unstructured":"Ra\u00fal\u00a0A S\u00e1nchez-Ancajima Sarajane\u00a0Marques Peres Javier\u00a0A L\u00f3pez-C\u00e9spedes Jos\u00e9\u00a0L Saly-Rosas-solano Ronald\u00a0M Hern\u00e1ndez and Miguel\u00a0A Saavedra-L\u00f3pez. [n. d.]. Gesture Phase Segmentation Dataset: An Extension for Development of Gesture Analysis Models. ([n. d.])."},{"key":"e_1_3_3_2_89_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606736"},{"key":"e_1_3_3_2_90_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713348"},{"key":"e_1_3_3_2_91_2","unstructured":"Jingyu Shi Rahul Jain Hyungjun Doh Ryo Suzuki and Karthik Ramani. 2023. An HCI-centric survey and taxonomy of human-generative-AI interactions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.07127 (2023)."},{"key":"e_1_3_3_2_92_2","first-page":"21","volume-title":"International Gesture Workshop","author":"Sowa Timo","year":"2001","unstructured":"Timo Sowa and Ipke Wachsmuth. 2001. Interpretation of shape-related iconic gestures in virtual environments. In International Gesture Workshop. Springer, 21\u201333."},{"key":"e_1_3_3_2_93_2","doi-asserted-by":"crossref","unstructured":"Benjamin Straube Antonia Green Bianca Bromberger and Tilo Kircher. 2011. The differentiation of iconic and metaphoric gestures: Common and unique integration processes. Human brain mapping 32 4 (2011) 520\u2013533.","DOI":"10.1002\/hbm.21041"},{"key":"e_1_3_3_2_94_2","doi-asserted-by":"crossref","unstructured":"Hendrik Strobelt Albert Webson Victor Sanh Benjamin Hoover Johanna Beyer Hanspeter Pfister and Alexander\u00a0M Rush. 2022. Interactive and visual prompt engineering for ad-hoc task adaptation with large language models. IEEE transactions on visualization and computer graphics 29 1 (2022) 1146\u20131156.","DOI":"10.1109\/TVCG.2022.3209479"},{"key":"e_1_3_3_2_95_2","doi-asserted-by":"crossref","unstructured":"Michael Studdert-Kennedy. 1994. Hand and Mind: What Gestures Reveal About Thought. Language and Speech 37 2 (1994) 203\u2013209.","DOI":"10.1177\/002383099403700208"},{"key":"e_1_3_3_2_96_2","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Yonghui Wu Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew\u00a0M Dai Anja Hauth et\u00a0al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.11805 (2023)."},{"key":"e_1_3_3_2_97_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_2_98_2","doi-asserted-by":"publisher","DOI":"10.1145\/2967175.2967385"},{"key":"e_1_3_3_2_99_2","doi-asserted-by":"crossref","unstructured":"Petra Wagner Zofia Malisz and Stefan Kopp. 2014. Gesture and speech in interaction: An overview. 209\u2013232\u00a0pages.","DOI":"10.1016\/j.specom.2013.09.008"},{"key":"e_1_3_3_2_100_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613905.3651026"},{"key":"e_1_3_3_2_101_2","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et\u00a0al. 2024. Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.12191 (2024)."},{"key":"e_1_3_3_2_102_2","doi-asserted-by":"publisher","DOI":"10.1145\/3472749.3474769"},{"key":"e_1_3_3_2_103_2","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415815"},{"key":"e_1_3_3_2_104_2","unstructured":"Weihan Wang Qingsong Lv Wenmeng Yu Wenyi Hong Ji Qi Yan Wang Junhui Ji Zhuoyi Yang Lei Zhao Xixuan Song Jiazheng Xu Bin Xu Juanzi Li Yuxiao Dong Ming Ding and Jie Tang. 2023. CogVLM: Visual Expert for Pretrained Language Models. (2023). arxiv:https:\/\/arXiv.org\/abs\/2311.03079\u00a0[cs.CV]"},{"key":"e_1_3_3_2_105_2","unstructured":"Zehan Wang Haifeng Huang Yang Zhao Ziang Zhang and Zhou Zhao. 2023. Chat-3d: Data-efficiently tuning large language model for universal dialogue of 3d scenes. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.08769 (2023)."},{"key":"e_1_3_3_2_106_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445552"},{"key":"e_1_3_3_2_107_2","doi-asserted-by":"crossref","unstructured":"Rong Wen Wei-Liang Tay Binh\u00a0P Nguyen Chin-Boon Chng and Chee-Kong Chui. 2014. Hand gesture guided robot-assisted surgery based on a direct augmented reality interface. Computer methods and programs in biomedicine 116 2 (2014) 68\u201380.","DOI":"10.1016\/j.cmpb.2013.12.018"},{"key":"e_1_3_3_2_108_2","doi-asserted-by":"crossref","unstructured":"Roel\u00a0M Willems Asl\u0131 \u00d6zy\u00fcrek and Peter Hagoort. 2007. When language meets action: The neural integration of gesture and speech. Cerebral Cortex 17 10 (2007) 2322\u20132333.","DOI":"10.1093\/cercor\/bhl141"},{"key":"e_1_3_3_2_109_2","doi-asserted-by":"publisher","DOI":"10.1145\/3643834.3660691"},{"key":"e_1_3_3_2_110_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517582"},{"key":"e_1_3_3_2_111_2","doi-asserted-by":"crossref","unstructured":"Ying\u00a0Choon Wu and Seana Coulson. 2014. Co-speech iconic gestures and visuo-spatial working memory. Acta psychologica 153 (2014) 39\u201350.","DOI":"10.1016\/j.actpsy.2014.09.002"},{"key":"e_1_3_3_2_112_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173652"},{"key":"e_1_3_3_2_113_2","doi-asserted-by":"crossref","unstructured":"LI Yang Jin Huang TIAN Feng WANG Hong-An and DAI Guo-Zhong. 2019. Gesture interaction in virtual reality. Virtual Reality & Intelligent Hardware 1 1 (2019) 84\u2013112.","DOI":"10.3724\/SP.J.2096-5796.2018.0006"},{"key":"e_1_3_3_2_114_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517689"},{"key":"e_1_3_3_2_115_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2016.7899949"},{"key":"e_1_3_3_2_116_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR55827.2022.00081"},{"key":"e_1_3_3_2_117_2","doi-asserted-by":"crossref","unstructured":"Lei Zhang Jin Pan Jacob Gettig Steve Oney and Anhong Guo. 2024. VRCopilot: Authoring 3D Layouts with Generative AI Models in VR. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.09382 (2024).","DOI":"10.1145\/3654777.3676451"},{"key":"e_1_3_3_2_118_2","doi-asserted-by":"crossref","unstructured":"Zeyi Zhang Tenglong Ao Yuyao Zhang Qingzhe Gao Chuan Lin Baoquan Chen and Libin Liu. 2024. Semantic Gesticulator: Semantics-Aware Co-Speech Gesture Synthesis. ACM Transactions on Graphics (TOG) 43 4 (2024) 1\u201317.","DOI":"10.1145\/3658134"},{"key":"e_1_3_3_2_119_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376313"},{"key":"e_1_3_3_2_120_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR59233.2023.00090"}],"event":{"name":"DIS '25: Designing Interactive Systems Conference","location":"Madeira Portugal","acronym":"DIS '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2025 ACM Designing Interactive Systems Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3715336.3735769","content-type":"text\/html","content-version":"vor","intended-application":"syndication"}],"deposited":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T11:22:54Z","timestamp":1751628174000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3715336.3735769"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,4]]},"references-count":119,"alternative-id":["10.1145\/3715336.3735769","10.1145\/3715336"],"URL":"https:\/\/doi.org\/10.1145\/3715336.3735769","relation":{},"subject":[],"published":{"date-parts":[[2025,7,4]]},"assertion":[{"value":"2025-07-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}