{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:59:47Z","timestamp":1776931187586,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":123,"publisher":"ACM","license":[{"start":{"date-parts":[[2027,4,13]],"date-time":"2027-04-13T00:00:00Z","timestamp":1807574400000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2047191"],"award-info":[{"award-number":["2047191"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3772318.3790618","type":"proceedings-article","created":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T04:12:28Z","timestamp":1776053548000},"page":"1-15","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Gesturing Toward Abstraction: Multimodal Convention Formation in Collaborative Physical Tasks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3270-1974","authenticated-orcid":false,"given":"Kiyosu","family":"Maeda","sequence":"first","affiliation":[{"name":"Princeton University, Princeton, New Jersey, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3084-7152","authenticated-orcid":false,"given":"William P","family":"McCarthy","sequence":"additional","affiliation":[{"name":"Cognitive Science, University of California, San Diego, La Jolla, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5664-6562","authenticated-orcid":false,"given":"Ching-Yi","family":"Tsai","sequence":"additional","affiliation":[{"name":"Princeton University, Princeton, New Jersey, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8925-8596","authenticated-orcid":false,"given":"Jeffrey","family":"Mu","sequence":"additional","affiliation":[{"name":"Brown University, Providence, Rhode Island, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7529-6991","authenticated-orcid":false,"given":"Haoliang","family":"Wang","sequence":"additional","affiliation":[{"name":"MIT, Cambridge, Massachusetts, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9089-8544","authenticated-orcid":false,"given":"Robert","family":"Hawkins","sequence":"additional","affiliation":[{"name":"Department of Linguistics, Stanford University, Stanford, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0097-3254","authenticated-orcid":false,"given":"Judith E.","family":"Fan","sequence":"additional","affiliation":[{"name":"Psychology, Stanford University, Stanford, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2145-3445","authenticated-orcid":false,"given":"Parastoo","family":"Abtahi","sequence":"additional","affiliation":[{"name":"Computer Science, Princeton University, Princeton, New Jersey, USA"}]}],"member":"320","published-online":{"date-parts":[[2026,4,13]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"Olga Abramov Friederike Kern Sofia Koutalidis Ulrich Mertens Katharina Rohlfing and Stefan Kopp. 2021. The relation between cognitive abilities and the distribution of semantic features across speech and gesture in 4-year-olds. Cognitive Science 45 7 (2021).","DOI":"10.1111\/cogs.13012"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"crossref","unstructured":"Martha\u00a0W. Alibali Miriam Bassok Karen\u00a0Olseth Solomon Sharon\u00a0E. Syc and Susan Goldin-Meadow. 1999. Illuminating mental representations through speech and gesture. Psychological Science 10 4 (1999) 327\u2013333.","DOI":"10.1111\/1467-9280.00163"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/2702613.2732927"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-43488-9_20"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Richard\u00a0N. Aslin Jenny\u00a0R. Saffran and Elissa\u00a0L. Newport. 1998. Computation of conditional probability statistics by 8-month-old infants. Psychological Science 9 4 (1998) 321\u2013324.","DOI":"10.1111\/1467-9280.00063"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Joseph\u00a0L. Austerweil and Thomas\u00a0L. Griffiths. 2013. A nonparametric Bayesian framework for constructing flexible feature representations. Psychological Review 120 4 (2013) 817.","DOI":"10.1037\/a0034194"},{"key":"e_1_3_3_2_8_2","first-page":"464","volume-title":"International Conference on Machine Learning","author":"Bapst Victor","year":"2019","unstructured":"Victor Bapst, Alvaro Sanchez-Gonzalez, Carl Doersch, Kimberly Stachenfeld, Pushmeet Kohli, Peter Battaglia, and Jessica Hamrick. 2019. Structured agents for physical construction. In International Conference on Machine Learning. PMLR, 464\u2013474."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISWC.1999.806696"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Sian\u00a0L. Beilock and Susan Goldin-Meadow. 2010. Gesture changes thought by grounding it in action. Psychological science 21 11 (2010) 1605\u20131610.","DOI":"10.1177\/0956797610385353"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/225434.225452"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/2207676.2207704"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/800250.807503"},{"key":"e_1_3_3_2_14_2","volume-title":"Proceedings of the Annual Meeting of the Cognitive Science Society","volume":"45","author":"Boyce Veronica","year":"2023","unstructured":"Veronica Boyce and Michael\u00a0C. Frank. 2023. Communicative reduction in referring expressions within a multi-player negotiation game. In Proceedings of the Annual Meeting of the Cognitive Science Society , Vol.\u00a045. Wiley-Blackwell, USA."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","unstructured":"Veronica Boyce Robert\u00a0D. Hawkins Noah\u00a0D. Goodman and Michael\u00a0C. Frank. 2024. Interaction structure constrains the emergence of conventions in group communication. Proceedings of the National Academy of Sciences 121 28 (2024) e2403888121. 10.1073\/pnas.2403888121","DOI":"10.1073\/pnas.2403888121"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Neil\u00a0R. Bramley and Fei Xu. 2023. Active inductive inference in children and adults: A constructivist perspective. Cognition 238 (2023) 105471.","DOI":"10.1016\/j.cognition.2023.105471"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Susan\u00a0E Brennan and Herbert\u00a0H Clark. 1996. Conceptual pacts and lexical choice in conversation. Journal of experimental psychology: Learning memory and cognition 22 6 (1996) 1482.","DOI":"10.1037\/0278-7393.22.6.1482"},{"key":"e_1_3_3_2_18_2","volume-title":"Proceedings of the thirteenth language resources and evaluation conference","author":"Brutti Richard","year":"2022","unstructured":"Richard Brutti, Lucia Donatelli, Kenneth Lai, and James Pustejovsky. 2022. Abstract meaning representation for gesture. In Proceedings of the thirteenth language resources and evaluation conference. https:\/\/par.nsf.gov\/servlets\/purl\/10409402"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00142"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Morten\u00a0H. Christiansen and Nick Chater. 2016. The now-or-never bottleneck: A fundamental constraint on language. Behavioral and Brain Sciences 39 (2016).","DOI":"10.1017\/S0140525X1500031X"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Mingyuan Chu and Sotaro Kita. 2008. Spontaneous gestures during mental rotation tasks: insights into the microdevelopment of the motor strategy. Journal of Experimental Psychology: General 137 4 (2008) 706.","DOI":"10.1037\/a0013157"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Mingyuan Chu and Sotaro Kita. 2011. The nature of gestures\u2019 beneficial role in spatial problem solving. Journal of experimental psychology: General 140 1 (2011) 102.","DOI":"10.1037\/a0021790"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511620539"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"crossref","unstructured":"Herbert\u00a0H. Clark and Deanna Wilkes-Gibbs. 1986. Referring as a collaborative process. Cognition 22 1 (1986) 1\u201339.","DOI":"10.1016\/0010-0277(86)90010-7"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Jan\u00a0Peter De\u00a0Ruiter. 2006. Can gesticulation help aphasic people speak or rather communicate? Advances in Speech Language Pathology 8 2 (2006) 124\u2013127.","DOI":"10.1080\/14417040600667285"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Judith Degen. 2023. The rational speech act framework. Annual Review of Linguistics 9 1 (2023) 519\u2013540.","DOI":"10.1146\/annurev-linguistics-031220-010811"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"crossref","unstructured":"Judith Degen Robert\u00a0D. Hawkins Caroline Graf Elisa Kreiss and Noah\u00a0D. Goodman. 2020. When redundancy is useful: A Bayesian approach to \u201coverinformative\u201d referring expressions. Psychological Review 127 4 (2020) 591.","DOI":"10.1037\/rev0000186"},{"key":"e_1_3_3_2_28_2","volume-title":"Proceedings of the 43rd Annual Meeting of the Cognitive Science Society, CogSci 2021, virtual, July 26-29, 2021","author":"Degen Judith","year":"2021","unstructured":"Judith Degen, Leyla Kursat, and Daisy\u00a0Dorothy Leigh. 2021. Seeing is believing: testing an explicit linking assumption for visual world eye-tracking in psycholinguistics. In Proceedings of the 43rd Annual Meeting of the Cognitive Science Society, CogSci 2021, virtual, July 26-29, 2021."},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2011.6094592"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Nicolas Fay Simon Garrod Leo Roberts and Nik Swoboda. 2010. The interactive evolution of human communication systems. Cognitive Science 34 3 (2010) 351\u2013386.","DOI":"10.1111\/j.1551-6709.2009.01090.x"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Nicolas Fay Casey\u00a0J. Lister T.\u00a0Mark Ellison and Susan Goldin-Meadow. 2014. Creating a communication system from scratch: gesture beats vocalization hands down. Frontiers in Psychology 5 (2014) 354.","DOI":"10.3389\/fpsyg.2014.00354"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Michael\u00a0C. Frank and Noah\u00a0D. Goodman. 2012. Predicting pragmatic reasoning in language games. Science 336 6084 (2012) 998\u2013998.","DOI":"10.1126\/science.1218633"},{"key":"e_1_3_3_2_33_2","unstructured":"Daniel Fried Nicholas Tomlin Jennifer Hu Roma Patel and Aida Nematzadeh. 2022. Pragmatics in language grounding: Phenomena tasks and modeling approaches. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.08371 (2022)."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/358916.358947"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","unstructured":"Susan\u00a0R. Fussell Leslie\u00a0D. Setlock Jie Yang Jiazhi Ou Elizabeth Mauer and Adam D.\u00a0I. Kramer. 2004. Gestures Over Video Streams to Support Remote Collaboration on Physical Tasks. Hum. Comput. Interact. 19 3 (2004) 273\u2013309. 10.1207\/S15327051HCI1903_3","DOI":"10.1207\/S15327051HCI1903_3"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"crossref","unstructured":"Simon Garrod and Gwyneth Doherty. 1994. Conversation co-ordination and convention: An empirical investigation of how groups establish linguistic conventions. Cognition 53 3 (1994) 181\u2013215.","DOI":"10.1016\/0010-0277(94)90048-5"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/1031607.1031687"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"crossref","unstructured":"Darren Gergle Robert\u00a0E. Kraut and Susan\u00a0R. Fussell. 2004. Language efficiency and visual technology: Minimizing collaborative effort with visual information. Journal of language and social psychology 23 4 (2004) 491\u2013517.","DOI":"10.1177\/0261927X04269589"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/1124772.1124968"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ROMAN.2011.6005255"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/HRI.2013.6483609"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.2307\/j.ctv1w9m9ds"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Susan Goldin-Meadow. 2006. Talking and thinking with our hands. Current directions in psychological science 15 1 (2006) 34\u201339.","DOI":"10.1111\/j.0963-7214.2006.00402.x"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Susan Goldin-Meadow and Susan\u00a0M Wagner. 2005. How our hands help us learn. Trends in cognitive sciences 9 5 (2005) 234\u2013241.","DOI":"10.1016\/j.tics.2005.03.006"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"crossref","unstructured":"Noah\u00a0D. Goodman and Michael\u00a0C. Frank. 2016. Pragmatic language interpretation as probabilistic inference. Trends in cognitive sciences 20 11 (2016) 818\u2013829.","DOI":"10.1016\/j.tics.2016.08.005"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"crossref","unstructured":"Noah\u00a0D. Goodman and Michael\u00a0C. Frank. 2016. Pragmatic language interpretation as probabilistic inference. Trends in Cognitive Sciences 20 11 (2016) 818 \u2013 829.","DOI":"10.1016\/j.tics.2016.08.005"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1163\/9789004368811_003"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"crossref","unstructured":"Marianne Gullberg and Kenneth Holmqvist. 2006. What speakers do and what addressees look at: Visual attention to gestures in human interaction live and on video. Pragmatics & Cognition 14 1 (2006) 53\u201382.","DOI":"10.1075\/pc.14.1.05gul"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"crossref","unstructured":"Marianne Gullberg and Sotaro Kita. 2009. Attention to speech-accompanying gestures: Eye movements and information uptake. Journal of nonverbal behavior 33 4 (2009) 251\u2013277.","DOI":"10.1007\/s10919-009-0073-2"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"crossref","unstructured":"Robert\u00a0D. Hawkins Michael Franke Michael\u00a0C. Frank Adele\u00a0E. Goldberg Kenny Smith Thomas\u00a0L. Griffiths and Noah\u00a0D. Goodman. 2023. From partners to populations: A hierarchical Bayesian account of coordination and convention. Psychological Review 130 4 (2023) 977.","DOI":"10.1037\/rev0000348"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2020.CONLL-1.33"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"crossref","unstructured":"Robert\u00a0D. Hawkins Megumi Sano Noah\u00a0D. Goodman and Judith\u00a0E. Fan. 2023. Visual resemblance and interaction history jointly constrain pictorial meaning. Nature Communications 14 1 (2023) 2199.","DOI":"10.1038\/s41467-023-37737-w"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","unstructured":"Henning Holle and Thomas\u00a0C. Gunter. 2007. The role of iconic gestures in speech disambiguation: ERP evidence. Journal of cognitive neuroscience 19 7 (2007) 1175\u20131192. 10.1162\/jocn.2007.19.7.1175","DOI":"10.1162\/jocn.2007.19.7.1175"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3715336.3735769"},{"key":"e_1_3_3_2_55_2","volume-title":"First Conference on Language Modeling","author":"Hua Yilun","year":"2024","unstructured":"Yilun Hua and Yoav Artzi. 2024. Talk Less, Interact Better: Evaluating In-context Conversational Adaptation in Multimodal LLMs. In First Conference on Language Modeling. https:\/\/openreview.net\/forum?id=lVOw78nYXS"},{"key":"e_1_3_3_2_56_2","volume-title":"Second Conference on Language Modeling","author":"Hua Yilun","year":"2025","unstructured":"Yilun Hua, Evan Wang, and Yoav Artzi. 2025. Post-training for Efficient Communication via Convention Formation. In Second Conference on Language Modeling. https:\/\/openreview.net\/forum?id=jRGGmbhX2s"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2009.5354149"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581277"},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","unstructured":"Jana\u00a0M. Iverson and Susan Goldin-Meadow. 2005. Gesture Paves the Way for Language Development. Psychological Science 16 5 (2005) 367\u2013371. 10.1111\/j.0956-7976.2005.01542.xPMID: 15869695.","DOI":"10.1111\/j.0956-7976.2005.01542.x"},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"crossref","unstructured":"Philip\u00a0L. Jackson Andrew\u00a0N. Meltzoff and Jean Decety. 2006. Neural circuits involved in imitation and perspective-taking. Neuroimage 31 1 (2006) 429\u2013439.","DOI":"10.1016\/j.neuroimage.2005.11.026"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2020.ACL-MAIN.232"},{"key":"e_1_3_3_2_62_2","series-title":"Proceedings of Machine Learning Research","first-page":"15144","volume-title":"International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA","volume":"202","author":"Jiang Guangyuan","year":"2023","unstructured":"Guangyuan Jiang, Manjie Xu, Shiji Xin, Wei Liang, Yujia Peng, Chi Zhang, and Yixin Zhu. 2023. MEWL: Few-shot multimodal word learning with referential uncertainty. In International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA(Proceedings of Machine Learning Research, Vol.\u00a0202). PMLR, New York, NY, 15144\u201315169. https:\/\/proceedings.mlr.press\/v202\/jiang23i.html"},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581444"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","unstructured":"R.\u00a0Kenny Jones Paul Guerrero Niloy\u00a0J. Mitra and Daniel Ritchie. 2023. ShapeCoder: Discovering Abstractions for Visual Programs from Unstructured Primitives. ACM Trans. Graph. 42 4 (2023). 10.1145\/3592416","DOI":"10.1145\/3592416"},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"crossref","unstructured":"Seokmin Kang and Barbara Tversky. 2016. From hands to minds: Gestures promote understanding. Cognitive Research: Principles and Implications 1 (2016) 1\u201315.","DOI":"10.1186\/s41235-016-0004-9"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"publisher","DOI":"10.1145\/3563657.3596065"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"publisher","unstructured":"Spencer\u00a0D. Kelly Asl\u0131 \u00d6zy\u00fcrek and Eric Maris. 2010. Two Sides of the Same Coin: Speech and Gesture Mutually Interact to Enhance Comprehension. Psychological Science 21 2 (2010) 260\u2013267. 10.1177\/0956797609357327PMID: 20424055.","DOI":"10.1177\/0956797609357327"},{"key":"e_1_3_3_2_68_2","volume-title":"Proceedings of the Meetings of the Cognitive Science Society","author":"Kessell Angela\u00a0M.","year":"2006","unstructured":"Angela\u00a0M. Kessell and Barbara Tversky. 2006. Using gestures and diagrams to think and talk about insight problems. In Proceedings of the Meetings of the Cognitive Science Society."},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"publisher","DOI":"10.1145\/2974804.2974814"},{"key":"e_1_3_3_2_70_2","doi-asserted-by":"crossref","unstructured":"Sotaro Kita Martha\u00a0W. Alibali and Mingyuan Chu. 2017. How do gestures influence thinking and speaking? The gesture-for-conceptualization hypothesis. Psychological review 124 3 (2017) 245.","DOI":"10.1037\/rev0000059"},{"key":"e_1_3_3_2_71_2","doi-asserted-by":"crossref","unstructured":"Bjorn B.\u00a0de Koning Katrina Mok Nadine Marcus and Paul Ayres. 2023. Investigating the role of hand perspective in learning from procedural animations. British Journal of Educational Psychology 93 (2023) 251\u2013269.","DOI":"10.1111\/bjep.12542"},{"key":"e_1_3_3_2_72_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01601"},{"key":"e_1_3_3_2_73_2","unstructured":"Robert\u00a0M. Krauss and Chi-Yue Chiu. 1998. Language and social behavior. (1998)."},{"key":"e_1_3_3_2_74_2","doi-asserted-by":"crossref","unstructured":"Robert\u00a0M. Krauss and Sidney Weinheimer. 1964. Changes in reference phrases as a function of frequency of usage in social interaction: A preliminary study. Psychonomic Science 1 1-12 (1964) 113\u2013114.","DOI":"10.3758\/BF03342817"},{"key":"e_1_3_3_2_75_2","doi-asserted-by":"crossref","unstructured":"Robert\u00a0E. Kraut Steven\u00a0H. Lewis and Lawrence\u00a0W. Swezey. 1982. Listener responsiveness and the coordination of conversation. Journal of personality and social psychology 43 4 (1982) 718.","DOI":"10.1037\/0022-3514.43.4.718"},{"key":"e_1_3_3_2_76_2","volume-title":"Proceedings of the 42th Annual Meeting of the Cognitive Science Society - Developing a Mind: Learning in Humans, Animals, and Machines, CogSci 2020, virtual, July 29 - August 1, 2020","author":"Kreiss Elisa","year":"2020","unstructured":"Elisa Kreiss and Judith Degen. 2020. Production expectations modulate contrastive inference. In Proceedings of the 42th Annual Meeting of the Cognitive Science Society - Developing a Mind: Learning in Humans, Animals, and Machines, CogSci 2020, virtual, July 29 - August 1, 2020."},{"key":"e_1_3_3_2_77_2","first-page":"13618","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"34","author":"Krishnaswamy Nikhil","year":"2020","unstructured":"Nikhil Krishnaswamy, Pradyumna Narayana, Rahul Bangar, Kyeongmin Rim, Dhruva Patil, David McNeely-White, Jaime Ruiz, Bruce Draper, Ross Beveridge, and James Pustejovsky. 2020. Diana\u2019s World: A Situated Multimodal Interactive Agent. In Proceedings of the AAAI Conference on Artificial Intelligence , Vol.\u00a034. 13618\u201313619."},{"key":"e_1_3_3_2_78_2","volume-title":"Proceedings of the 12th International Conference on Computational Semantics (IWCS)\u2014Short papers","author":"Krishnaswamy Nikhil","year":"2017","unstructured":"Nikhil Krishnaswamy, Pradyumna Narayana, Isaac Wang, Kyeongmin Rim, Rahul Bangar, Dhruva Patil, Gururaj Mulay, Ross Beveridge, Jaime Ruiz, Bruce Draper, and James Pustejovsky. 2017. Communicating and acting: Understanding gesture in simulation semantics. In Proceedings of the 12th International Conference on Computational Semantics (IWCS)\u2014Short papers."},{"key":"e_1_3_3_2_79_2","volume-title":"Proceedings of the thirteenth language resources and evaluation conference","author":"Krishnaswamy Nikhil","year":"2022","unstructured":"Nikhil Krishnaswamy, William Pickard, Brittany Cates, Nathaniel Blanchard, and James Pustejovsky. 2022. The voxworld platform for multimodal embodied agents. In Proceedings of the thirteenth language resources and evaluation conference."},{"key":"e_1_3_3_2_80_2","doi-asserted-by":"publisher","DOI":"10.1145\/3332165.3347872"},{"key":"e_1_3_3_2_81_2","unstructured":"Angeliki Lazaridou and Marco Baroni. 2020. Emergent Multi-Agent Communication in the Deep Learning Era. CoRR abs\/2006.02419 (2020). arXiv:https:\/\/arXiv.org\/abs\/2006.02419https:\/\/arxiv.org\/abs\/2006.02419"},{"key":"e_1_3_3_2_82_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR.2018.00051"},{"key":"e_1_3_3_2_83_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642230"},{"key":"e_1_3_3_2_84_2","volume-title":"Convention: A philosophical study","author":"Lewis David","year":"2008","unstructured":"David Lewis. 2008. Convention: A philosophical study. John Wiley & Sons, USA."},{"key":"e_1_3_3_2_85_2","series-title":"Proceedings of Machine Learning Research","first-page":"3061","volume-title":"Conference on Robot Learning, CoRL 2023, 6-9 November 2023, Atlanta, GA, USA","volume":"229","author":"Lin Li-Heng","year":"2023","unstructured":"Li-Heng Lin, Yuchen Cui, Yilun Hao, Fei Xia, and Dorsa Sadigh. 2023. Gesture-Informed Robot Assistance via Foundation Models. In Conference on Robot Learning, CoRL 2023, 6-9 November 2023, Atlanta, GA, USA(Proceedings of Machine Learning Research, Vol.\u00a0229). PMLR, USA, 3061\u20133082. https:\/\/proceedings.mlr.press\/v229\/lin23a.html"},{"key":"e_1_3_3_2_86_2","doi-asserted-by":"crossref","unstructured":"Yung-Ching Liu. 2001. Comparative study of the effects of auditory visual and multimodality displays on drivers\u2019 performance in advanced traveller information systems. Ergonomics 44 4 (2001) 425\u2013442.","DOI":"10.1080\/00140130010011369"},{"key":"e_1_3_3_2_87_2","volume-title":"Proceedings of the Annual Meeting of the Cognitive Science Society","author":"Maeda Kiyosu","year":"2025","unstructured":"Kiyosu Maeda, Ching-Yi Tsai, Judith\u00a0E. Fan, and Parastoo Abtahi. 2025. Using Gesture and Language to Establish Multimodal Conventions in Collaborative Physical Tasks. In Proceedings of the Annual Meeting of the Cognitive Science Society."},{"key":"e_1_3_3_2_88_2","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2017.8205971"},{"key":"e_1_3_3_2_89_2","volume-title":"Proceedings of the Annual Meeting of the Cognitive Science Society","author":"McCarthy William\u00a0P.","year":"2021","unstructured":"William\u00a0P. McCarthy, Robert\u00a0D. Hawkins, Haoliang Wang, Cameron Holdaway, and Judith\u00a0E. Fan. 2021. Learning to communicate about shared procedural abstractions. In Proceedings of the Annual Meeting of the Cognitive Science Society."},{"key":"e_1_3_3_2_90_2","volume-title":"Proceedings of the Annual Meeting of the Cognitive Science Society","volume":"42","author":"McCarthy William\u00a0P.","year":"2020","unstructured":"William\u00a0P. McCarthy, David Kirsh, and Judith\u00a0E. Fan. 2020. Learning to build physical structures better over time. In Proceedings of the Annual Meeting of the Cognitive Science Society , Vol.\u00a042."},{"key":"e_1_3_3_2_91_2","doi-asserted-by":"publisher","unstructured":"William\u00a0P. McCarthy David Kirsh and Judith\u00a0E. Fan. 2023. Consistency and Variation in Reasoning About Physical Assembly. Cognitive Science 47 12 (2023) e13397. 10.1111\/cogs.13397","DOI":"10.1111\/cogs.13397"},{"key":"e_1_3_3_2_92_2","doi-asserted-by":"publisher","DOI":"10.1145\/3635636.3664261"},{"key":"e_1_3_3_2_93_2","volume-title":"Hand and Mind: What gestures reveal about thought","author":"McNeill David","year":"1992","unstructured":"David McNeill. 1992. Hand and Mind: What gestures reveal about thought. Chicago: University of Chicago Press."},{"key":"e_1_3_3_2_94_2","volume-title":"Proceedings of SAI Intelligent Systems Conference","author":"Narayana Pradyumna","year":"2018","unstructured":"Pradyumna Narayana, Nikhil Krishnaswamy, Isaac Wang, Rahul Bangar, Dhruva Patil, Gururaj Mulay, Kyeongmin Rim, Ross Beveridge, Jaime Ruiz, James Pustejovsky, and Bruce Draper. 2018. Cooperating with avatars through gesture, language and action. In Proceedings of SAI Intelligent Systems Conference. Springer."},{"key":"e_1_3_3_2_95_2","doi-asserted-by":"crossref","unstructured":"Miriam\u00a0A. Novack Eliza\u00a0L. Congdon Naureen Hemani-Lopez and Susan Goldin-Meadow. 2014. From action to abstraction: Using the hands to learn math. Psychological science 25 4 (2014) 903\u2013910.","DOI":"10.1177\/0956797613518351"},{"key":"e_1_3_3_2_96_2","volume-title":"Proceedings of the Annual Meeting of the Cognitive Science Society","volume":"23","author":"Ono Tetsuo","year":"2001","unstructured":"Tetsuo Ono, Michita Imai, and Hiroshi Ishiguro. 2001. A model of embodied communications with gestures between human and robots. In Proceedings of the Annual Meeting of the Cognitive Science Society , Vol.\u00a023."},{"key":"e_1_3_3_2_97_2","first-page":"32","volume-title":"International Conference on Intelligent Human Computer Interaction","author":"Phukon Mridumoni","year":"2023","unstructured":"Mridumoni Phukon and Abhishek Shrivastava. 2023. Effect of Speech Entrainment in Human-Computer Conversation: A Review. In International Conference on Intelligent Human Computer Interaction. Springer, 32\u201343."},{"key":"e_1_3_3_2_98_2","doi-asserted-by":"crossref","unstructured":"Wim Pouw Mark Dingemanse Yasamin Motamedi and Asl\u0131 \u00d6zy\u00fcrek. 2021. A systematic investigation of gesture kinematics in evolving manual languages in the lab. Cognitive science 45 7 (2021) e13014.","DOI":"10.1111\/cogs.13014"},{"key":"e_1_3_3_2_99_2","volume-title":"Proceedings of the IWCS workshop on Foundations of Situated and Multimodal Communication","author":"Pustejovsky James","year":"2017","unstructured":"James Pustejovsky, Nikhil Krishnaswamy, Bruce Draper, Pradyumna Narayana, and Rahul Bangar. 2017. Creating common ground through multimodal simulations. In Proceedings of the IWCS workshop on Foundations of Situated and Multimodal Communication."},{"key":"e_1_3_3_2_100_2","volume-title":"Advances in Neural Information Processing Systems","author":"Qiu Shuwen","year":"2022","unstructured":"Shuwen Qiu, Sirui Xie, Lifeng Fan, Tao Gao, Jungseock Joo, Song-Chun Zhu, and Yixin Zhu. 2022. Emergent Graphical Conventions in a Visual Communication Game. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_3_2_101_2","doi-asserted-by":"crossref","unstructured":"Francis Quek David McNeill Robert Bryll Susan Duncan Xin-Feng Ma Cemil Kirbas Karl\u00a0E. McCullough and Rashid Ansari. 2002. Multimodal human discourse: gesture and speech. ACM Transactions on Computer-Human Interaction (TOCHI) 9 3 (2002) 171\u2013193.","DOI":"10.1145\/568513.568514"},{"key":"e_1_3_3_2_102_2","series-title":"Proceedings of Machine Learning Research","first-page":"28492","volume-title":"International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA","volume":"202","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust Speech Recognition via Large-Scale Weak Supervision. In International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA(Proceedings of Machine Learning Research, Vol.\u00a0202). PMLR, 28492\u201328518."},{"key":"e_1_3_3_2_103_2","doi-asserted-by":"publisher","unstructured":"Nicole Robinson Brendan Tidd Dylan Campbell Dana Kuli\u0107 and Peter Corke. 2023. Robotic vision for human-robot interaction and collaboration: A survey and systematic review. ACM Transactions on Human-Robot Interaction 12 1 (2023) 1\u201366. 10.48550\/ARXIV.2307.15363","DOI":"10.48550\/ARXIV.2307.15363"},{"key":"e_1_3_3_2_104_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642945"},{"key":"e_1_3_3_2_105_2","doi-asserted-by":"crossref","unstructured":"Wendy Sandler Irit Meir Carol Padden and Mark Aronoff. 2005. The emergence of grammar: Systematic structure in a new language. Proceedings of the National Academy of Sciences 102 7 (2005) 2661\u20132665.","DOI":"10.1073\/pnas.0405448102"},{"key":"e_1_3_3_2_106_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300852"},{"key":"e_1_3_3_2_107_2","doi-asserted-by":"publisher","DOI":"10.63317\/3g58v686p99j"},{"key":"e_1_3_3_2_108_2","doi-asserted-by":"crossref","unstructured":"Daniel\u00a0L. Schwartz and John\u00a0B. Black. 1996. Shuttling between depictive models and abstract rules: Induction and fallback. Cognitive science 20 4 (1996) 457\u2013497.","DOI":"10.1207\/s15516709cog2004_1"},{"key":"e_1_3_3_2_109_2","doi-asserted-by":"publisher","unstructured":"Rajeev Sharma Vladimir\u00a0I. Pavlovic and Thomas\u00a0S. Huang. 1998. Toward multimodal human-computer interface. Proc. IEEE 86 5 (1998) 853\u2013869. 10.1109\/5.664275","DOI":"10.1109\/5.664275"},{"key":"e_1_3_3_2_110_2","doi-asserted-by":"publisher","unstructured":"Amy\u00a0Lynne Shelton E.\u00a0Emory Davis Cathryn\u00a0S. Cortesa Jonathan\u00a0D. Jones Gregory\u00a0D. Hager Sanjeev Khudanpur and Barbara Landau. 2022. Characterizing the Details of Spatial Construction: Cognitive Constraints and Variability. Cogn. Sci. 46 1 (2022). 10.1111\/COGS.13081","DOI":"10.1111\/COGS.13081"},{"key":"e_1_3_3_2_111_2","volume-title":"International Conference on Learning Representations","author":"Shih Andy","year":"2021","unstructured":"Andy Shih, Arjun Sawhney, Jovana Kondic, Stefano Ermon, and Dorsa Sadigh. 2021. On the Critical Role of Conventions in Adaptive Human-AI Collaboration. In International Conference on Learning Representations."},{"key":"e_1_3_3_2_112_2","doi-asserted-by":"publisher","DOI":"10.1609\/icwsm.v14i1.7331"},{"key":"e_1_3_3_2_113_2","doi-asserted-by":"crossref","unstructured":"Darja Stoeva Andreas Kriegler and Margrit Gelautz. 2024. Body Movement Mirroring and Synchrony in Human\u2013Robot Interaction. ACM Transactions on Human-Robot Interaction 13 4 (2024) 1\u201326.","DOI":"10.1145\/3682074"},{"key":"e_1_3_3_2_114_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_6"},{"key":"e_1_3_3_2_115_2","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2017.145"},{"key":"e_1_3_3_2_116_2","first-page":"159","volume-title":"International Conference on Human-Computer Interaction","author":"Wang Isaac","year":"2021","unstructured":"Isaac Wang, Pradyumna Narayana, Dhruva Patil, Rahul Bangar, Bruce Draper, Ross Beveridge, and Jaime Ruiz. 2021. It\u2019s a Joint Effort: Understanding Speech and Gesture in Collaborative Tasks. In International Conference on Human-Computer Interaction. Springer, 159\u2013178."},{"key":"e_1_3_3_2_117_2","doi-asserted-by":"publisher","DOI":"10.1145\/3027063.3053239"},{"key":"e_1_3_3_2_118_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/P16-1224"},{"key":"e_1_3_3_2_119_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01854"},{"key":"e_1_3_3_2_120_2","doi-asserted-by":"crossref","unstructured":"Deanna Wilkes-Gibbs and Herbert\u00a0H Clark. 1992. Coordinating beliefs in conversation. Journal of memory and language 31 2 (1992) 183\u2013194.","DOI":"10.1016\/0749-596X(92)90010-U"},{"key":"e_1_3_3_2_121_2","doi-asserted-by":"publisher","unstructured":"Adam\u00a0S. Williams and Francisco\u00a0Raul Ortega. 2020. Understanding Gesture and Speech Multimodal Interactions for Manipulation Tasks in Augmented Reality Using Unconstrained Elicitation. Proc. ACM Hum. Comput. Interact. 4 ISS (2020) 202:1\u2013202:21. 10.1145\/3427330","DOI":"10.1145\/3427330"},{"key":"e_1_3_3_2_122_2","doi-asserted-by":"publisher","DOI":"10.1109\/VR58804.2024.00108"},{"key":"e_1_3_3_2_123_2","doi-asserted-by":"publisher","unstructured":"Jacob Young Tobias Langlotz Matthew Cook Steven Mills and Holger Regenbrecht. 2019. Immersive Telepresence and Remote Collaboration using Mobile and Wearable Devices. IEEE Trans. Vis. Comput. Graph. 25 5 (2019) 1908\u20131918. 10.1109\/TVCG.2019.2898737","DOI":"10.1109\/TVCG.2019.2898737"},{"key":"e_1_3_3_2_124_2","doi-asserted-by":"publisher","unstructured":"Chen Zheng and Barbara Tversky. 2024. Putting it together together. Cognitive Science 48 2 (2024) e13405. https:\/\/doi.org\/10.1111\/cogs.13405","DOI":"10.1111\/cogs.13405"}],"event":{"name":"CHI 2026: CHI Conference on Human Factors in Computing Systems","location":"Barcelona Spain","acronym":"CHI '26","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2026 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772318.3790618","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772318.3790618","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T10:02:10Z","timestamp":1776420130000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772318.3790618"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,13]]},"references-count":123,"alternative-id":["10.1145\/3772318.3790618","10.1145\/3772318"],"URL":"https:\/\/doi.org\/10.1145\/3772318.3790618","relation":{},"subject":[],"published":{"date-parts":[[2026,4,13]]},"assertion":[{"value":"2026-04-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}