{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T00:55:28Z","timestamp":1773536128299,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","funder":[{"name":"JST Moonshot R and D","award":["JPMJMS2011"],"award-info":[{"award-number":["JPMJMS2011"]}]},{"name":"JSPS KAKENHI","award":["24H00722"],"award-info":[{"award-number":["24H00722"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,16]]},"DOI":"10.1145\/3757279.3785554","type":"proceedings-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T00:27:38Z","timestamp":1773102458000},"page":"854-863","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Communicating Object Relations through Robot Gestures"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3483-4578","authenticated-orcid":false,"given":"Xiang","family":"Pan","sequence":"first","affiliation":[{"name":"Kyoto University, Kyoto, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2805-0187","authenticated-orcid":false,"given":"Malcolm","family":"Doering","sequence":"additional","affiliation":[{"name":"Kyoto University, Kyoto, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9546-5825","authenticated-orcid":false,"given":"Takayuki","family":"Kanda","sequence":"additional","affiliation":[{"name":"Kyoto University, Kyoto, Japan"}]}],"member":"320","published-online":{"date-parts":[[2026,3,16]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","unstructured":"Michael Ahn Anthony Brohan Noah Brown Yevgen Chebotar Omar Cortes Byron David Chelsea Finn Chuyuan Fu Keerthana Gopalakrishnan Karol Hausman et al. 2022. Do as i can not as i say: Grounding language in robotic affordances. arXiv preprint arXiv:2204.01691 https:\/\/doi.org\/10.48550\/arXiv.2204.01691 10.48550\/arXiv.2204.01691","DOI":"10.48550\/arXiv.2204.01691"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1207\/s15427633scc0504_2"},{"key":"e_1_3_2_2_3_1","volume-title":"Teachers","author":"Alibali Martha W","unstructured":"Martha W Alibali and Mitchell J Nathan. 2014. Teachers\u2019 gestures as a means of scaffolding students\u2019 understanding: Evidence from an early algebra lesson. In Video research in the learning sciences. Routledge, 349\u2013365."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475223"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2909824.3020208"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/192161.192272"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680847"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1037\/0033-2909.112.1.155"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-25554-5_18"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3549530"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2696454.2696473"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/HRI.2013.6483603"},{"key":"e_1_3_2_2_13_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning.","author":"Driess Danny","year":"2023","unstructured":"Danny Driess, Fei Xia, Mehdi SM Sajjadi, Corey Lynch, Aakanksha Chowdhery, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, Wenlong Huang, et al. 2023. Palm-e: An embodied multimodal language model. In Proceedings of the 40th International Conference on Machine Learning."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","unstructured":"Michael J Gielniak and Andrea L Thomaz. 2011. Generating anticipation in robot motion. In 2011 RO-MAN. 449\u2013454. https:\/\/doi.org\/10.1109\/ROMAN.2011.6005255 10.1109\/ROMAN.2011.6005255","DOI":"10.1109\/ROMAN.2011.6005255"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","unstructured":"Christian Heath. 1992. Gesture\u2019s discreet tasks: Multiple relevancies in visual conduct and in the contextualisation of language. https:\/\/doi.org\/10.1075\/pbns.22.08hea 10.1075\/pbns.22.08hea","DOI":"10.1075\/pbns.22.08hea"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2025.3575983"},{"key":"e_1_3_2_2_17_1","volume-title":"Proceedings of the 6th Conference on Robot Learning (CoRL).","author":"Huang Wenlong","year":"2022","unstructured":"Wenlong Huang, Fei Xia, Ted Xiao, Harris Chan, Jacky Liang, Pete Florence, Andy Zeng, Jonathan Tompson, Igor Mordatch, Yevgen Chebotar, et al. 2022. Inner Monologue: Embodied Reasoning through Planning with Language Models. In Proceedings of the 6th Conference on Robot Learning (CoRL)."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.3389\/fpsyg.2020.575991"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ROBOT.2002.1014810"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS45743.2020.9341067"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1207\/s15327973rlsi2703_2"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511807572"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2406.09246"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/11821830_17"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/HUMANOIDS.2011.6100857"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/s12369-010-0071-x"},{"key":"e_1_3_2_2_27_1","volume-title":"IEEE\/CVF International Conference on Computer Vision.","author":"Liu Pinxin","year":"2025","unstructured":"Pinxin Liu, Luchuan Song, Junhua Huang, and Chenliang Xu. 2025. GestureLSM: Latent Shortcut based Co-Speech Gesture Generation with Spatial-Temporal Modeling. In IEEE\/CVF International Conference on Computer Vision."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610977.3634999"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1515\/9783110874259.351"},{"key":"e_1_3_2_2_30_1","volume-title":"Proceedings of the 7th Conference on Robot Learning (CoRL).","author":"Mirchandani Suvir","year":"2023","unstructured":"Suvir Mirchandani, Fei Xia, Pete Florence, Brian Ichter, Danny Driess, Montserrat Gonzalez Arenas, Kanishka Rao, Dorsa Sadigh, and Andy Zeng. 2023. Large Language Models as General Pattern Machines. In Proceedings of the 7th Conference on Robot Learning (CoRL)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.14776"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610977.3634986"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/HRI61500.2025.10973989"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1117\/12.3060395"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/568513.568514"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.3102\/00346543071003365"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAMD.2014.2312399"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3511409"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/s12369-020-00667-4"},{"key":"e_1_3_2_2_40_1","volume-title":"Proceedings of the 7th Conference on Robot Learning (CoRL).","author":"Tang Yujin","year":"2023","unstructured":"Yujin Tang, Wenhao Yu, Jie Tan, Heiga Zen, Aleksandra Faust, and Tatsuya Harada. 2023. Saytap: Language to quadrupedal locomotion. In Proceedings of the 7th Conference on Robot Learning (CoRL)."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10341944"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2013.09.008"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","unstructured":"Beichen Wang Juexiao Zhang Shuwen Dong Irving Fang and Chen Feng. 2024. Vlm see robot do: Human demo video to robot action plan via vision language model. arXiv preprint arXiv:2410.08792 https:\/\/doi.org\/10.48550\/arXiv.2410.08792 10.48550\/arXiv.2410.08792","DOI":"10.48550\/arXiv.2410.08792"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793720"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1111\/cogs.13428"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3658134"},{"key":"e_1_3_2_2_48_1","volume-title":"Proceedings of the 7th Conference on Robot Learning (CoRL).","author":"Zitkovich Brianna","year":"2023","unstructured":"Brianna Zitkovich, Tianhe Yu, Sichun Xu, Peng Xu, Ted Xiao, Fei Xia, Jialin Wu, Paul Wohlhart, Stefan Welker, Ayzaan Wahid, et al. 2023. Rt-2: Vision-language-action models transfer web knowledge to robotic control. In Proceedings of the 7th Conference on Robot Learning (CoRL)."}],"event":{"name":"HRI '26: 21st ACM\/IEEE International Conference on Human-Robot Interaction","location":"Edinburgh Scotland UK","acronym":"HRI '26","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","SIGCHI ACM Special Interest Group on Computer-Human Interaction","IEEE RAS"]},"container-title":["Proceedings of the 21st ACM\/IEEE International Conference on Human-Robot Interaction"],"original-title":[],"deposited":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T00:35:32Z","timestamp":1773534932000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3757279.3785554"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,16]]},"references-count":48,"alternative-id":["10.1145\/3757279.3785554","10.1145\/3757279"],"URL":"https:\/\/doi.org\/10.1145\/3757279.3785554","relation":{},"subject":[],"published":{"date-parts":[[2026,3,16]]},"assertion":[{"value":"2026-03-16","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}