{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T10:03:04Z","timestamp":1780999384295,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":166,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T00:00:00Z","timestamp":1776038400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"National Key R&D Program of China","award":["2024YFB4505500"],"award-info":[{"award-number":["2024YFB4505500"]}]},{"name":"National Key R&D Program of China","award":["2024YFB4505501"],"award-info":[{"award-number":["2024YFB4505501"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3772318.3790922","type":"proceedings-article","created":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T04:12:21Z","timestamp":1776053541000},"page":"1-24","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["GazeCoT: Unleashing Social Intelligence in Multimodal LLMs With Gaze-Informed Chain-of-Thought Reasoning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-9929-4734","authenticated-orcid":false,"given":"Zhoutong","family":"Ye","sequence":"first","affiliation":[{"name":"Department of Computer Science and Technology, BNRist, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6461-5446","authenticated-orcid":false,"given":"Xutong","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4285-7192","authenticated-orcid":false,"given":"Chengwen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1146-9474","authenticated-orcid":false,"given":"Ruiwen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Academy of Arts &amp; Design, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4205-7182","authenticated-orcid":false,"given":"Mingze","family":"Sun","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8132-9091","authenticated-orcid":false,"given":"Qinwei","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2591-7993","authenticated-orcid":false,"given":"Chun","family":"Yu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, BNRist, College of AI, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2273-6927","authenticated-orcid":false,"given":"Yuanchun","family":"Shi","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, BNRist, Tsinghua University, Beijing, China and Qinghai University, Xining, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,4,13]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"Chirag Agarwal Sree\u00a0Harsha Tanneru and Himabindu Lakkaraju. 2024. Faithfulness vs. Plausibility: On the (Un)Reliability of Explanations from Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2402.04614\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2402.04614"},{"key":"e_1_3_3_3_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/2838739.2838770"},{"key":"e_1_3_3_3_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679832"},{"key":"e_1_3_3_3_5_2","doi-asserted-by":"publisher","unstructured":"William\u00a0Sims Bainbridge Edward\u00a0E. Brent Kathleen\u00a0M. Carley David\u00a0R. Heise Michael\u00a0W. Macy Barry Markovsky and John Skvoretz. 1994. Artificial Social Intelligence. Annual Review of Sociology 20 Volume 20 1994 (1994) 407\u2013436. 10.1146\/annurev.so.20.080194.002203","DOI":"10.1146\/annurev.so.20.080194.002203"},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01373"},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"publisher","unstructured":"Andrew\u00a0P. Bayliss Jessica Bartlett Claire\u00a0K. Naughtin and Ada Kritikos. 2011. A direct link between gaze perception and social attention. Journal of Experimental Psychology: Human Perception and Performance 37 3 (2011) 634\u2013644. 10.1037\/a0020559","DOI":"10.1037\/a0020559"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3726302.3730167"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-0-387-93808-0_15"},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/VR59515.2025.00093"},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"publisher","unstructured":"Riccardo Bovo Daniele Giunchi Muna Alebri Anthony Steed Enrico Costanza and Thomas Heinis. 2022. Cone of Vision as a Behavioural Cue for VR Collaboration. Proc. ACM Hum.-Comput. Interact. 6 CSCW2 Article 502 (Nov. 2022) 27\u00a0pages. 10.1145\/3555615","DOI":"10.1145\/3555615"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581283"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","unstructured":"Rechele Brooks and Andrew\u00a0N. Meltzoff. 2005. The development of gaze following and its relation to language. Developmental Science 8 6 (2005) 535\u2013543. 10.1111\/j.1467-7687.2005.00445.x","DOI":"10.1111\/j.1467-7687.2005.00445.x"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.807"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01227"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713953"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613905.3650868"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706599.3719852"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676375"},{"key":"e_1_3_3_3_20_2","unstructured":"Boxu Chen Ziwei Zheng Le Yang Zeyu Geng Zhengyu Zhao Chenhao Lin and Chao Shen. 2025. Seeing It or Not? Interpretable Vision-aware Latent Steering to Mitigate Object Hallucinations. arxiv:https:\/\/arXiv.org\/abs\/2505.17812\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2505.17812"},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"publisher","unstructured":"Huili Chen Yubin Kim Kejia Patterson Cynthia Breazeal and Hae\u00a0Won Park. 2025. Social robots as conversational catalysts: Enhancing long-term human-human interaction at home. Science Robotics 10 100 (2025) eadk3307. 10.1126\/scirobotics.adk3307","DOI":"10.1126\/scirobotics.adk3307"},{"key":"e_1_3_3_3_22_2","unstructured":"Qiguang Chen Libo Qin Jinhao Liu Dengyun Peng Jiannan Guan Peng Wang Mengkang Hu Yuhang Zhou Te Gao and Wanxiang Che. 2025. Towards Reasoning Era: A Survey of Long Chain-of-Thought for Reasoning Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2503.09567\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2503.09567"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"publisher","unstructured":"Wenhe Chen Hui Xu Chao Zhu Xiaoli Liu Yinghua Lu Caixia Zheng and Jun Kong. 2022. Gaze Estimation via the Joint Modeling of Multiple Cues. IEEE Transactions on Circuits and Systems for Video Technology 32 3 (2022) 1390\u20131402. 10.1109\/TCSVT.2021.3071621","DOI":"10.1109\/TCSVT.2021.3071621"},{"key":"e_1_3_3_3_24_2","series-title":"(ICML\u201924)","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Chen Yanda","year":"2024","unstructured":"Yanda Chen, Ruiqi Zhong, Narutatsu Ri, Chen Zhao, He He, Jacob Steinhardt, Zhou Yu, and Kathleen McKeown. 2024. Do models explain themselves? counterfactual simulatability of natural language explanations. In Proceedings of the 41st International Conference on Machine Learning (Vienna, Austria) (ICML\u201924). JMLR.org, Article 310, 25\u00a0pages."},{"key":"e_1_3_3_3_25_2","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu Lixin Gu Xuehui Wang Qingyun Li Yimin Ren Zixuan Chen Jiapeng Luo Jiahao Wang Tan Jiang Bo Wang Conghui He Botian Shi Xingcheng Zhang Han Lv Yi Wang Wenqi Shao Pei Chu Zhongying Tu Tong He Zhiyong Wu Huipeng Deng Jiaye Ge Kai Chen Kaipeng Zhang Limin Wang Min Dou Lewei Lu Xizhou Zhu Tong Lu Dahua Lin Yu Qiao Jifeng Dai and Wenhai Wang. 2025. Expanding Performance Boundaries of Open-Source Multimodal Models with Model Data and Test-Time Scaling. arxiv:https:\/\/arXiv.org\/abs\/2412.05271\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2412.05271"},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.847"},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3610978.3640669"},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00544"},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"publisher","unstructured":"Nicholas\u00a0Kluge Corr\u00eaa Camila Galv\u00e3o James\u00a0William Santos Carolina Del\u00a0Pino Edson\u00a0Pontes Pinto Camila Barbosa Diogo Massmann Rodrigo Mambrini Luiza Galv\u00e3o Edmund Terem and Nythamar de Oliveira. 2023. Worldwide AI ethics: A review of 200 guidelines and recommendations for AI governance. Patterns 4 10 (2023). 10.1016\/j.patter.2023.100857","DOI":"10.1016\/j.patter.2023.100857"},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676379"},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3715668.3736363"},{"key":"e_1_3_3_3_32_2","series-title":"(ICML\u201923)","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Driess Danny","year":"2023","unstructured":"Danny Driess, Fei Xia, Mehdi S.\u00a0M. Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, Wenlong Huang, Yevgen Chebotar, Pierre Sermanet, Daniel Duckworth, Sergey Levine, Vincent Vanhoucke, Karol Hausman, Marc Toussaint, Klaus Greff, Andy Zeng, Igor Mordatch, and Pete Florence. 2023. PaLM-E: an embodied multimodal language model. In Proceedings of the 40th International Conference on Machine Learning (Honolulu, Hawaii, USA) (ICML\u201923). JMLR.org, Article 340, 20\u00a0pages."},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00713"},{"key":"e_1_3_3_3_34_2","unstructured":"Robert\u00a0M Emerson Rachel\u00a0I Fretz and Linda\u00a0L Shaw. 1995. Writing ethnographic fieldnotes. Chicago guides to writing editing. and publishing. Chicago IL: University of Chicago Press. Feld S. & Brenneis D.(2004). Doing anthropology in sound. American Ethnologist 31 4 (1995) 461\u2013474."},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"publisher","unstructured":"N.J. Emery. 2000. The eyes have it: the neuroethology function and evolution of social gaze. Neuroscience & Biobehavioral Reviews 24 6 (2000) 581\u2013604. 10.1016\/S0149-7634(00)00025-7","DOI":"10.1016\/S0149-7634(00)00025-7"},{"key":"e_1_3_3_3_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00676"},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00723"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"crossref","unstructured":"Lifeng Fan Manjie Xu Zhihao Cao Yixin Zhu and Song-Chun Zhu. 2022. Artificial social intelligence: A comparative and holistic view. CAAI Artificial Intelligence Research 1 2 (2022) 144\u2013160.","DOI":"10.26599\/AIR.2022.9150010"},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01123"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0685"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","unstructured":"Alexandra Frischen Andrew\u00a0P. Bayliss and Steven\u00a0P. Tipper. 2007. Gaze cueing of attention: Visual attention social cognition and individual differences. Psychological Bulletin 133 4 (2007) 694\u2013724. 10.1037\/0033-2909.133.4.694PsycINFO Database Record (c) 2025 APA all rights reserved.","DOI":"10.1037\/0033-2909.133.4.694"},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"publisher","unstructured":"Yuyang Gao Tong\u00a0Steven Sun Liang Zhao and Sungsoo\u00a0Ray Hong. 2022. Aligning Eyes between Humans and Deep Neural Network through Interactive Attention Alignment. Proc. ACM Hum.-Comput. Interact. 6 CSCW2 Article 489 (Nov. 2022) 28\u00a0pages. 10.1145\/3555590","DOI":"10.1145\/3555590"},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"publisher","unstructured":"Khashayar Ghamati Maryam\u00a0Banitalebi Dehkordi and Abolfazl Zaraki. 2025. Which AI Sees Like Us? Investigating the Cognitive Plausibility of Language and Vision Models via Eye-Tracking in Human-Robot Interaction. Sensors 25 15 (2025). 10.3390\/s25154687","DOI":"10.3390\/s25154687"},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","unstructured":"Jiawei Gu Xuhui Jiang Zhichao Shi Hexiang Tan Xuehao Zhai Chengjin Xu Wei Li Yinghan Shen Shengjie Ma Honghao Liu Saizhuo Wang Kun Zhang Zhouchi Lin Bowen Zhang Lionel Ni Wen Gao Yuanzhuo Wang and Jian Guo. 2025. A survey on LLM-as-a-Judge. The Innovation (2025). 10.1016\/j.xinn.2025.101253doi: 10.1016\/j.xinn.2025.101253.","DOI":"10.1016\/j.xinn.2025.101253"},{"key":"e_1_3_3_3_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3663548.3675658"},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01309"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73010-8_23"},{"key":"e_1_3_3_3_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00552"},{"key":"e_1_3_3_3_49_2","doi-asserted-by":"crossref","unstructured":"Peter\u00a0A Hancock Deborah\u00a0R Billings Kristin\u00a0E Schaefer Jessie\u00a0YC Chen Ewart\u00a0J De\u00a0Visser and Raja Parasuraman. 2011. A meta-analysis of factors affecting trust in human-robot interaction. Human factors 53 5 (2011) 517\u2013527.","DOI":"10.1177\/0018720811417254"},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","unstructured":"Robert\u00a0R. Hoffman Shane\u00a0T. Mueller Gary Klein and Jordan Litman. 2023. Measures for explainable AI: Explanation goodness user satisfaction mental models curiosity trust and human-AI performance. Frontiers in Computer Science Volume 5 - 2023 (2023). 10.3389\/fcomp.2023.1096257","DOI":"10.3389\/fcomp.2023.1096257"},{"key":"e_1_3_3_3_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/302979.303030"},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"publisher","unstructured":"Baosheng\u00a0James Hou Joshua Newn Ludwig Sidenmark Anam\u00a0Ahmad Khan and Hans Gellersen. 2024. GazeSwitch: Automatic Eye-Head Mode Switching for Optimised Hands-Free Pointing. Proc. ACM Hum.-Comput. Interact. 8 ETRA Article 227 (May 2024) 20\u00a0pages. 10.1145\/3655601","DOI":"10.1145\/3655601"},{"key":"e_1_3_3_3_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/3715336.3735769"},{"key":"e_1_3_3_3_54_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.306"},{"key":"e_1_3_3_3_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02775"},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"publisher","unstructured":"Runhui Huang Yanxin Long Jianhua Han Hang Xu Xiwen Liang Chunjing Xu and Xiaodan Liang. 2023. NLIP: Noise-Robust Language-Image Pre-training. Proceedings of the AAAI Conference on Artificial Intelligence 37 1 (Jun. 2023) 926\u2013934. 10.1609\/aaai.v37i1.25172","DOI":"10.1609\/aaai.v37i1.25172"},{"key":"e_1_3_3_3_57_2","first-page":"20067","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Jiang Biao","year":"2023","unstructured":"Biao Jiang, Xin Chen, Wen Liu, Jingyi Yu, Gang Yu, and Tao Chen. 2023. MotionGPT: Human Motion as a Foreign Language. In Advances in Neural Information Processing Systems , A.\u00a0Oh, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 20067\u201320079. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/3fbf0c1ea0716c03dea93bb6be78dd6f-Paper-Conference.pdf"},{"key":"e_1_3_3_3_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01113"},{"key":"e_1_3_3_3_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376219"},{"key":"e_1_3_3_3_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713530"},{"key":"e_1_3_3_3_61_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72384-1_18"},{"key":"e_1_3_3_3_62_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_3_63_2","first-page":"16509","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Koley Subhadeep","year":"2024","unstructured":"Subhadeep Koley, Ayan\u00a0Kumar Bhunia, Aneeshan Sain, Pinaki\u00a0Nath Chowdhury, Tao Xiang, and Yi-Zhe Song. 2024. You\u2019ll Never Walk Alone: A Sketch and Text Duet for Fine-Grained Image Retrieval. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 16509\u201316519."},{"key":"e_1_3_3_3_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501865"},{"key":"e_1_3_3_3_65_2","doi-asserted-by":"publisher","unstructured":"Dennis K\u00fcster Eva\u00a0G. Krumhuber Lars Steinert Anuj Ahuja Marc Baker and Tanja Schultz. 2020. Opportunities and Challenges for Using Automatic Human Affect Analysis in Consumer Research. Frontiers in Neuroscience Volume 14 - 2020 (2020). 10.3389\/fnins.2020.00400","DOI":"10.3389\/fnins.2020.00400"},{"key":"e_1_3_3_3_66_2","doi-asserted-by":"publisher","unstructured":"Stephen R.\u00a0H. Langton Roger\u00a0J. Watt and Vicki Bruce. 2000. Do the eyes have it? Cues to the direction of social attention. Trends in Cognitive Sciences 4 2 (Feb. 2000) 50\u201359. 10.1016\/S1364-6613(99)01436-9","DOI":"10.1016\/S1364-6613(99)01436-9"},{"key":"e_1_3_3_3_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746059.3747748"},{"key":"e_1_3_3_3_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642230"},{"key":"e_1_3_3_3_69_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706599.3720236"},{"key":"e_1_3_3_3_70_2","doi-asserted-by":"crossref","unstructured":"John\u00a0D Lee and Katrina\u00a0A See. 2004. Trust in automation: Designing for appropriate reliance. Human factors 46 1 (2004) 50\u201380.","DOI":"10.1518\/hfes.46.1.50.30392"},{"key":"e_1_3_3_3_71_2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1868"},{"key":"e_1_3_3_3_72_2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0542"},{"key":"e_1_3_3_3_73_2","unstructured":"Bo Li Yuanhan Zhang Dong Guo Renrui Zhang Feng Li Hao Zhang Kaichen Zhang Peiyuan Zhang Yanwei Li Ziwei Liu and Chunyuan Li. 2024. LLaVA-OneVision: Easy Visual Task Transfer. arxiv:https:\/\/arXiv.org\/abs\/2408.03326\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2408.03326"},{"key":"e_1_3_3_3_74_2","doi-asserted-by":"publisher","unstructured":"Hao Li Hao Fei Zechao Hu Zhengwei Yang and Zheng Wang. 2025. VEGAS: Towards Visually Explainable and Grounded Artificial Social Intelligence. Proceedings of the AAAI Conference on Artificial Intelligence 39 5 (Apr. 2025) 4707\u20134715. 10.1609\/aaai.v39i5.32497","DOI":"10.1609\/aaai.v39i5.32497"},{"key":"e_1_3_3_3_75_2","series-title":"Proceedings of Machine Learning Research","first-page":"19730","volume-title":"Proceedings of the 40th International Conference on Machine Learning","volume":"202","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In Proceedings of the 40th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 19730\u201319742. https:\/\/proceedings.mlr.press\/v202\/li23q.html"},{"key":"e_1_3_3_3_76_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3754993"},{"key":"e_1_3_3_3_77_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681598"},{"key":"e_1_3_3_3_78_2","first-page":"34892","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong\u00a0Jae Lee. 2023. Visual Instruction Tuning. In Advances in Neural Information Processing Systems , A.\u00a0Oh, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 34892\u201334916. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/6dcf277ea32ce3288914faf369fe6de0-Paper-Conference.pdf"},{"key":"e_1_3_3_3_79_2","unstructured":"Hanchao Liu Wenyuan Xue Yifei Chen Dapeng Chen Xiutian Zhao Ke Wang Liping Hou Rongjun Li and Wei Peng. 2024. A Survey on Hallucination in Large Vision-Language Models. arxiv:https:\/\/arXiv.org\/abs\/2402.00253\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2402.00253"},{"key":"e_1_3_3_3_80_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746059.3747653"},{"key":"e_1_3_3_3_81_2","volume-title":"International Conference on Learning Representations","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. SGDR: Stochastic Gradient Descent with Warm Restarts. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=Skq89Scxx"},{"key":"e_1_3_3_3_82_2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1528"},{"key":"e_1_3_3_3_83_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676331"},{"key":"e_1_3_3_3_84_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3714235"},{"key":"e_1_3_3_3_85_2","doi-asserted-by":"publisher","DOI":"10.1145\/1357054.1357072"},{"key":"e_1_3_3_3_86_2","doi-asserted-by":"publisher","unstructured":"Aoran Mei Guo-Niu Zhu Huaxiang Zhang and Zhongxue Gan. 2024. ReplanVLM: Replanning Robotic Tasks With Visual Language Models. IEEE Robotics and Automation Letters 9 11 (2024) 10201\u201310208. 10.1109\/LRA.2024.3471457","DOI":"10.1109\/LRA.2024.3471457"},{"key":"e_1_3_3_3_87_2","doi-asserted-by":"publisher","unstructured":"Michael Merry Pat Riddle and Jim Warren. 2021. A mental models approach for defining explainable artificial intelligence. BMC Medical Informatics and Decision Making 21 1 (2021) 344. 10.1186\/s12911-021-01703-7","DOI":"10.1186\/s12911-021-01703-7"},{"key":"e_1_3_3_3_88_2","doi-asserted-by":"publisher","DOI":"10.1145\/3025171.3025195"},{"key":"e_1_3_3_3_89_2","doi-asserted-by":"publisher","unstructured":"Tim Miller. 2019. Explanation in artificial intelligence: Insights from the social sciences. Artificial Intelligence 267 (2019) 1\u201338. 10.1016\/j.artint.2018.07.007","DOI":"10.1016\/j.artint.2018.07.007"},{"key":"e_1_3_3_3_90_2","unstructured":"Seyed\u00a0Mahed Mousavi Edoardo Cecchinato Lucia Hornikova and Giuseppe Riccardi. 2025. Garbage In Reasoning Out? Why Benchmark Scores are Unreliable and What to Do About It. arxiv:https:\/\/arXiv.org\/abs\/2506.23864\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2506.23864"},{"key":"e_1_3_3_3_91_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01881"},{"key":"e_1_3_3_3_92_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-16-7498-3_9"},{"key":"e_1_3_3_3_93_2","doi-asserted-by":"publisher","unstructured":"Lauri Nummenmaa and Andrew\u00a0J. Calder. 2009. Neural mechanisms of social attention. Trends in Cognitive Sciences 13 3 (March 2009) 135\u2013143. 10.1016\/j.tics.2008.12.006","DOI":"10.1016\/j.tics.2008.12.006"},{"key":"e_1_3_3_3_94_2","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy\u00a0V. Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel HAZIZA Francisco Massa Alaaeldin El-Nouby Mido Assran Nicolas Ballas Wojciech Galuba Russell Howes Po-Yao Huang Shang-Wen Li Ishan Misra Michael Rabbat Vasu Sharma Gabriel Synnaeve Hu Xu Herve Jegou Julien Mairal Patrick Labatut Armand Joulin and Piotr Bojanowski. 2024. DINOv2: Learning Robust Visual Features without Supervision. Transactions on Machine Learning Research (2024). https:\/\/openreview.net\/forum?id=a68SUt6zFt Featured Certification."},{"key":"e_1_3_3_3_95_2","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2016.7759741"},{"key":"e_1_3_3_3_96_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713726"},{"key":"e_1_3_3_3_97_2","doi-asserted-by":"publisher","unstructured":"Kevin\u00a0A Pelphrey Jeffrey\u00a0D Singerman Truett Allison and Gregory McCarthy. 2003. Brain activation evoked by perception of gaze shifts: the influence of context. Neuropsychologia 41 2 (2003) 156\u2013170. 10.1016\/S0028-3932(02)00146-XThe cognitive neuroscience of social behavior.","DOI":"10.1016\/S0028-3932(02)00146-X"},{"key":"e_1_3_3_3_98_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2018.8545068"},{"key":"e_1_3_3_3_99_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642619"},{"key":"e_1_3_3_3_100_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445315"},{"key":"e_1_3_3_3_101_2","doi-asserted-by":"publisher","unstructured":"Tingting Qiao Jianfeng Dong and Duanqing Xu. 2018. Exploring Human-Like Attention Supervision in Visual Question Answering. Proceedings of the AAAI Conference on Artificial Intelligence 32 1 (Apr. 2018). 10.1609\/aaai.v32i1.12272","DOI":"10.1609\/aaai.v32i1.12272"},{"key":"e_1_3_3_3_102_2","doi-asserted-by":"publisher","DOI":"10.1145\/3448017.3457381"},{"key":"e_1_3_3_3_103_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Ravi Nikhila","year":"2025","unstructured":"Nikhila Ravi, Valentin Gabeur, Yuan-Ting Hu, Ronghang Hu, Chaitanya Ryali, Tengyu Ma, Haitham Khedr, Roman R\u00e4dle, Chloe Rolland, Laura Gustafson, Eric Mintun, Junting Pan, Kalyan\u00a0Vasudev Alwala, Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr Dollar, and Christoph Feichtenhofer. 2025. SAM 2: Segment Anything in Images and Videos. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=Ha6RTeWMd0"},{"key":"e_1_3_3_3_104_2","volume-title":"Advances in Neural Information Processing Systems","author":"Recasens Adria","year":"2015","unstructured":"Adria Recasens, Aditya Khosla, Carl Vondrick, and Antonio Torralba. 2015. Where are they looking?. In Advances in Neural Information Processing Systems , C.\u00a0Cortes, N.\u00a0Lawrence, D.\u00a0Lee, M.\u00a0Sugiyama, and R.\u00a0Garnett (Eds.), Vol.\u00a028. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2015\/file\/ec8956637a99787bd197eacd77acce5e-Paper.pdf"},{"key":"e_1_3_3_3_105_2","doi-asserted-by":"publisher","DOI":"10.1145\/3745900.3746075"},{"key":"e_1_3_3_3_106_2","doi-asserted-by":"crossref","unstructured":"Evan\u00a0F Risko Daniel\u00a0C Richardson and Alan Kingstone. 2016. Breaking the fourth wall of cognitive science: Real-world social attention and the dual function of gaze. Current Directions in Psychological Science 25 1 (2016) 70\u201374.","DOI":"10.1177\/0963721415617806"},{"key":"e_1_3_3_3_107_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02689"},{"key":"e_1_3_3_3_108_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1454"},{"key":"e_1_3_3_3_109_2","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2018.8593580"},{"key":"e_1_3_3_3_110_2","doi-asserted-by":"publisher","unstructured":"Debajit Sarma and Manas\u00a0Kamal Bhuyan. 2021. Methods databases and recent advancement of vision-based hand gesture recognition for hci systems: A review. SN Computer Science 2 6 (2021) 436. https:\/\/doi.org\/10.1007\/s42979-021-00827-x","DOI":"10.1007\/s42979-021-00827-x"},{"key":"e_1_3_3_3_111_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713910"},{"key":"e_1_3_3_3_112_2","doi-asserted-by":"publisher","unstructured":"Fiona\u00a0Louise Scott. 2022. Family mediation of preschool children\u2019s digital media practices at home. Learning Media and Technology 47 2 (2022) 235\u2013250. 10.1080\/17439884.2021.1960859","DOI":"10.1080\/17439884.2021.1960859"},{"key":"e_1_3_3_3_113_2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1361"},{"key":"e_1_3_3_3_114_2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0275"},{"key":"e_1_3_3_3_115_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.312"},{"key":"e_1_3_3_3_116_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706599.3720215"},{"key":"e_1_3_3_3_117_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01101"},{"key":"e_1_3_3_3_118_2","doi-asserted-by":"publisher","unstructured":"Ludwig Sidenmark Franziska Prummer Joshua Newn and Hans Gellersen. 2023. Comparing Gaze Head and Controller Selection of Dynamically Revealed Targets in Head-Mounted Displays. IEEE Transactions on Visualization and Computer Graphics 29 11 (2023) 4740\u20134750. 10.1109\/TVCG.2023.3320235","DOI":"10.1109\/TVCG.2023.3320235"},{"key":"e_1_3_3_3_119_2","unstructured":"Oriane Sim\u00e9oni Huy\u00a0V. Vo Maximilian Seitzer Federico Baldassarre Maxime Oquab Cijo Jose Vasil Khalidov Marc Szafraniec Seungeun Yi Micha\u00ebl Ramamonjisoa Francisco Massa Daniel Haziza Luca Wehrstedt Jianyuan Wang Timoth\u00e9e Darcet Th\u00e9o Moutakanni Leonel Sentana Claire Roberts Andrea Vedaldi Jamie Tolan John Brandt Camille Couprie Julien Mairal Herv\u00e9 J\u00e9gou Patrick Labatut and Piotr Bojanowski. 2025. DINOv3. arxiv:https:\/\/arXiv.org\/abs\/2508.10104\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2508.10104"},{"key":"e_1_3_3_3_120_2","doi-asserted-by":"publisher","unstructured":"Daeun Song Jing Liang Amirreza Payandeh Amir\u00a0Hossain Raj Xuesu Xiao and Dinesh Manocha. 2025. VLM-Social-Nav: Socially Aware Robot Navigation Through Scoring Using Vision-Language Models. IEEE Robotics and Automation Letters 10 1 (2025) 508\u2013515. 10.1109\/LRA.2024.3511409","DOI":"10.1109\/LRA.2024.3511409"},{"key":"e_1_3_3_3_121_2","doi-asserted-by":"crossref","unstructured":"Lisa\u00a0J Stephenson S\u00a0Gareth Edwards and Andrew\u00a0P Bayliss. 2021. From gaze perception to social cognition: The shared-attention system. Perspectives on Psychological Science 16 3 (2021) 553\u2013576.","DOI":"10.1177\/1745691620953773"},{"key":"e_1_3_3_3_122_2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3858"},{"key":"e_1_3_3_3_123_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01914"},{"key":"e_1_3_3_3_124_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746059.3747597"},{"key":"e_1_3_3_3_125_2","unstructured":"Yunjie Tian Qixiang Ye and David Doermann. 2025. YOLOv12: Attention-Centric Real-Time Object Detectors. arxiv:https:\/\/arXiv.org\/abs\/2502.12524\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2502.12524"},{"key":"e_1_3_3_3_126_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00349"},{"key":"e_1_3_3_3_127_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3714294"},{"key":"e_1_3_3_3_128_2","first-page":"74952","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Turpin Miles","year":"2023","unstructured":"Miles Turpin, Julian Michael, Ethan Perez, and Samuel Bowman. 2023. Language Models Don't Always Say What They Think: Unfaithful Explanations in Chain-of-Thought Prompting. In Advances in Neural Information Processing Systems , A.\u00a0Oh, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 74952\u201374965. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/ed3fea9033a80fea1376299fa7863f4a-Paper-Conference.pdf"},{"key":"e_1_3_3_3_129_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2014.7025309"},{"key":"e_1_3_3_3_130_2","doi-asserted-by":"publisher","unstructured":"Franz\u00a0A. Van-Horenbeke and Angelika Peer. 2021. Activity Plan and Goal Recognition: A Review. Frontiers in Robotics and AI Volume 8 - 2021 (2021). 10.3389\/frobt.2021.643010","DOI":"10.3389\/frobt.2021.643010"},{"key":"e_1_3_3_3_131_2","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141\u00a0ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems , I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.), Vol.\u00a030. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_3_3_132_2","doi-asserted-by":"publisher","DOI":"10.1145\/302979.303065"},{"key":"e_1_3_3_3_133_2","doi-asserted-by":"publisher","unstructured":"Alessandro Vinciarelli Maja Pantic and Herv\u00e9 Bourlard. 2009. Social signal processing: Survey of an emerging domain. Image and Vision Computing 27 12 (2009) 1743\u20131759. 10.1016\/j.imavis.2008.11.007Visual and multimodal analysis of human spontaneous behaviour:.","DOI":"10.1016\/j.imavis.2008.11.007"},{"key":"e_1_3_3_3_134_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.331"},{"key":"e_1_3_3_3_135_2","doi-asserted-by":"crossref","unstructured":"Sandra Wachter. 2024. Limitations and loopholes in the EU AI Act and AI Liability Directives: what this means for the European Union the United States and beyond. Yale Journal of Law and Technology 26 3 (2024) 671\u2013718.","DOI":"10.2139\/ssrn.4924553"},{"key":"e_1_3_3_3_136_2","volume-title":"Proceedings of the Annual Meeting of the Cognitive Science Society","volume":"46","author":"Wang Junqi","year":"2024","unstructured":"Junqi Wang, Chunhui Zhang, Jiapeng Li, Yuxi Ma, Lixing Niu, Jiaheng Han, Yujia Peng, Yixin Zhu, and Lifeng Fan. 2024. Evaluating and Modeling Social Intelligence: A Comparative Study of Human and AI Capabilities. In Proceedings of the Annual Meeting of the Cognitive Science Society , Vol.\u00a046. https:\/\/escholarship.org\/uc\/item\/2j53v5nv"},{"key":"e_1_3_3_3_137_2","doi-asserted-by":"publisher","unstructured":"Xukang Wang Ying\u00a0Cheng Wu Mengjie Zhou and Hongpeng Fu. 2024. Beyond surveillance: privacy ethics and regulations in face recognition technology. Frontiers in Big Data Volume 7 - 2024 (2024). 10.3389\/fdata.2024.1337465","DOI":"10.3389\/fdata.2024.1337465"},{"key":"e_1_3_3_3_138_2","doi-asserted-by":"publisher","unstructured":"Zeyu Wang Yuanchun Shi Yuntao Wang Yuchen Yao Kun Yan Yuhan Wang Lei Ji Xuhai Xu and Chun Yu. 2024. G-VOILA: Gaze-Facilitated Information Querying in Daily Scenarios. Proc. ACM Interact. Mob. Wearable Ubiquitous Technol. 8 2 Article 78 (May 2024) 33\u00a0pages. 10.1145\/3659623","DOI":"10.1145\/3659623"},{"key":"e_1_3_3_3_139_2","unstructured":"Laura Weidinger John Mellor Maribeth Rauh Conor Griffin Jonathan Uesato Po-Sen Huang Myra Cheng Mia Glaese Borja Balle Atoosa Kasirzadeh Zac Kenton Sasha Brown Will Hawkins Tom Stepleton Courtney Biles Abeba Birhane Julia Haas Laura Rimell Lisa\u00a0Anne Hendricks William Isaac Sean Legassick Geoffrey Irving and Iason Gabriel. 2021. Ethical and social risks of harm from Language Models. arxiv:https:\/\/arXiv.org\/abs\/2112.04359\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2112.04359"},{"key":"e_1_3_3_3_140_2","doi-asserted-by":"publisher","unstructured":"Glenn\u00a0E. Weisfeld and Jody\u00a0M. Beresford. 1982. Erectness of posture as an indicator of dominance or success in humans. Motivation and Emotion 6 2 (1982) 113\u2013131. 10.1007\/BF00992459","DOI":"10.1007\/BF00992459"},{"key":"e_1_3_3_3_141_2","doi-asserted-by":"publisher","DOI":"10.1145\/3715669.3727349"},{"key":"e_1_3_3_3_142_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73661-2_12"},{"key":"e_1_3_3_3_143_2","doi-asserted-by":"publisher","unstructured":"Claire Woodcock Brent Mittelstadt Dan Busbridge and Grant Blank. 2021. The Impact of Explanations on Layperson Trust in Artificial Intelligence\u2013Driven Symptom Checker Apps: Experimental Study. J Med Internet Res 23 11 (3 Nov 2021) e29386. 10.2196\/29386","DOI":"10.2196\/29386"},{"key":"e_1_3_3_3_144_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-long.75"},{"key":"e_1_3_3_3_145_2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0060"},{"key":"e_1_3_3_3_146_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00994"},{"key":"e_1_3_3_3_147_2","doi-asserted-by":"publisher","unstructured":"Yifan Yao Jinhao Duan Kaidi Xu Yuanfang Cai Zhibo Sun and Yue Zhang. 2024. A survey on large language model (LLM) security and privacy: The Good The Bad and The Ugly. High-Confidence Computing 4 2 (2024) 100211. 10.1016\/j.hcc.2024.100211","DOI":"10.1016\/j.hcc.2024.100211"},{"key":"e_1_3_3_3_148_2","unstructured":"Zhoutong Ye Mingze Sun Huan ang Gao Xutong Wang Xiangyang Wang Yu Mei Chang Liu Qinwei Li Chengwen Zhang Qinghuan Lan Chun Yu and Yuanchun Shi. 2025. MOAT: Evaluating LMMs for Capability Integration and Instruction Grounding. arxiv:https:\/\/arXiv.org\/abs\/2503.09348\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2503.09348"},{"key":"e_1_3_3_3_149_2","doi-asserted-by":"publisher","unstructured":"Kate Yen Yeqi Chen Yi Cheng Sijin Chen Ying-Yu Chen Yiran Ni and Alexis Hiniker. 2018. Joint Media Engagement between Parents and Preschoolers in the U.S. China and Taiwan. Proc. ACM Hum.-Comput. Interact. 2 CSCW Article 192 (Nov. 2018) 19\u00a0pages. 10.1145\/3274461","DOI":"10.1145\/3274461"},{"key":"e_1_3_3_3_150_2","doi-asserted-by":"publisher","unstructured":"Shukang Yin Chaoyou Fu Sirui Zhao Ke Li Xing Sun Tong Xu and Enhong Chen. 2024. A survey on multimodal large language models. National Science Review 11 12 (11 2024) nwae403. 10.1093\/nsr\/nwae403","DOI":"10.1093\/nsr\/nwae403"},{"key":"e_1_3_3_3_151_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642307"},{"key":"e_1_3_3_3_152_2","series-title":"(ICML\u201924)","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Yu Weihao","year":"2024","unstructured":"Weihao Yu, Zhengyuan Yang, Linjie Li, Jianfeng Wang, Kevin Lin, Zicheng Liu, Xinchao Wang, and Lijuan Wang. 2024. MM-Vet: evaluating large multimodal models for integrated capabilities. In Proceedings of the 41st International Conference on Machine Learning (Vienna, Austria) (ICML\u201924). JMLR.org, Article 2381, 25\u00a0pages."},{"key":"e_1_3_3_3_153_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.648"},{"key":"e_1_3_3_3_154_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713265"},{"key":"e_1_3_3_3_155_2","doi-asserted-by":"publisher","unstructured":"Xin Zeng Xiaoyu Wang Tengxiang Zhang Chun Yu Shengdong Zhao and Yiqiang Chen. 2024. GestureGPT: Toward Zero-Shot Free-Form Hand Gesture Understanding with Large Language Model Agents. Proc. ACM Hum.-Comput. Interact. 8 ISS Article 545 (Oct. 2024) 38\u00a0pages. 10.1145\/3698145","DOI":"10.1145\/3698145"},{"key":"e_1_3_3_3_156_2","doi-asserted-by":"publisher","unstructured":"John Zerilli Umang Bhatt and Adrian Weller. 2022. How transparency modulates trust in artificial intelligence. Patterns 3 4 (April 2022). 10.1016\/j.patter.2022.100455","DOI":"10.1016\/j.patter.2022.100455"},{"key":"e_1_3_3_3_157_2","series-title":"(IJCAI\u201920)","volume-title":"Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence","author":"Zhang Ruohan","year":"2021","unstructured":"Ruohan Zhang, Akanksha Saran, Bo Liu, Yifeng Zhu, Sihang Guo, Scott Niekum, Dana Ballard, and Mary Hayhoe. 2021. Human gaze assisted artificial intelligence: a review. In Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence (Yokohama, Japan) (IJCAI\u201920). Article 689, 8\u00a0pages."},{"key":"e_1_3_3_3_158_2","doi-asserted-by":"publisher","unstructured":"Xu Zhang Kailun Yang Jiacheng Lin Jin Yuan Zhiyong Li and Shutao Li. 2024. PVPUFormer: Probabilistic Visual Prompt Unified Transformer for Interactive Image Segmentation. IEEE Transactions on Image Processing 33 (2024) 6455\u20136468. 10.1109\/TIP.2024.3492713","DOI":"10.1109\/TIP.2024.3492713"},{"key":"e_1_3_3_3_159_2","doi-asserted-by":"publisher","unstructured":"Yue Zhang Yafu Li Leyang Cui Deng Cai Lemao Liu Tingchen Fu Xinting Huang Enbo Zhao Yu Zhang Yulong Chen Longyue Wang Anh\u00a0Tuan Luu Wei Bi Freda Shi and Shuming Shi. 2025. Siren\u2019s Song in the AI Ocean: A Survey on Hallucination in Large Language Models. Computational Linguistics 51 4 (12 2025) 1373\u20131418. 10.1162\/COLI.a.16","DOI":"10.1162\/COLI.a.16"},{"key":"e_1_3_3_3_160_2","doi-asserted-by":"publisher","unstructured":"Yanxia Zhang Ken Pfeuffer Ming\u00a0Ki Chong Jason Alexander Andreas Bulling and Hans Gellersen. 2017. Look together: using gaze for assisting co-located collaborative search. Personal Ubiquitous Comput. 21 1 (Feb. 2017) 173\u2013186. 10.1007\/s00779-016-0969-x","DOI":"10.1007\/s00779-016-0969-x"},{"key":"e_1_3_3_3_161_2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1639"},{"key":"e_1_3_3_3_162_2","doi-asserted-by":"crossref","unstructured":"Zory Zhang Pinyuan Feng Bingyang Wang Tianwei Zhao Suyang Yu Qingying Gao Hokin Deng Ziqiao Ma Yijiang Li and Dezhi Luo. 2025. Can Vision Language Models Infer Human Gaze Direction? A Controlled Study. arxiv:https:\/\/arXiv.org\/abs\/2506.05412\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2506.05412","DOI":"10.31219\/osf.io\/c9xvn_v3"},{"key":"e_1_3_3_3_163_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713289"},{"key":"e_1_3_3_3_164_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73337-6_8"},{"key":"e_1_3_3_3_165_2","doi-asserted-by":"publisher","unstructured":"Ce Zheng Wenhan Wu Chen Chen Taojiannan Yang Sijie Zhu Ju Shen Nasser Kehtarnavaz and Mubarak Shah. 2023. Deep Learning-based Human Pose Estimation: A Survey. ACM Comput. Surv. 56 1 Article 11 (Aug. 2023) 37\u00a0pages. 10.1145\/3603618","DOI":"10.1145\/3603618"},{"key":"e_1_3_3_3_166_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"e_1_3_3_3_167_2","volume-title":"The Twelfth International Conference on Learning Representations","author":"Zhu Deyao","year":"2024","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2024. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=1tZbq88f27"}],"event":{"name":"CHI 2026: CHI Conference on Human Factors in Computing Systems","location":"Barcelona Spain","acronym":"CHI '26","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2026 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772318.3790922","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T09:54:25Z","timestamp":1780998865000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772318.3790922"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,13]]},"references-count":166,"alternative-id":["10.1145\/3772318.3790922","10.1145\/3772318"],"URL":"https:\/\/doi.org\/10.1145\/3772318.3790922","relation":{},"subject":[],"published":{"date-parts":[[2026,4,13]]},"assertion":[{"value":"2026-04-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}