{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T17:12:43Z","timestamp":1775841163756,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":97,"publisher":"ACM","funder":[{"name":"Shenzhen Science and Technology Program","award":["ZDSYS20220323112000001"],"award-info":[{"award-number":["ZDSYS20220323112000001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3774904.3792996","type":"proceedings-article","created":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T21:54:39Z","timestamp":1775771679000},"page":"8862-8873","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Egocentric Co-Pilot: Web-Native Smart-Glasses Agents for Assistive Egocentric AI"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-2018-0604","authenticated-orcid":false,"given":"Sicheng","family":"Yang","sequence":"first","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5725-5884","authenticated-orcid":false,"given":"Yukai","family":"Huang","sequence":"additional","affiliation":[{"name":"Independent researcher, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7726-4387","authenticated-orcid":false,"given":"Weitong","family":"Cai","sequence":"additional","affiliation":[{"name":"Queen Mary University of London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1825-655X","authenticated-orcid":false,"given":"Shitong","family":"Sun","sequence":"additional","affiliation":[{"name":"Queen Mary University of London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1082-8368","authenticated-orcid":false,"given":"Fengyi","family":"Fang","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2942-1699","authenticated-orcid":false,"given":"You","family":"He","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8566-1607","authenticated-orcid":false,"given":"Yiqiao","family":"Xie","sequence":"additional","affiliation":[{"name":"Imperial College London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3709-6216","authenticated-orcid":false,"given":"Jiankang","family":"Deng","sequence":"additional","affiliation":[{"name":"Imperial College London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0115-387X","authenticated-orcid":false,"given":"Hang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Independent researcher, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3381-6685","authenticated-orcid":false,"given":"Jifei","family":"Song","sequence":"additional","affiliation":[{"name":"University of Surrey, Guildford, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7911-7564","authenticated-orcid":false,"given":"Zhensong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Independent researcher, London, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2026,4,12]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","unstructured":"Michael Ahn Anthony Brohan Noah Brown et al. 2022. Do As I Can Not As I Say: Grounding Language in Robotic Affordances. CoRR Vol. abs\/2204.01691 (2022). arXiv:2204.01691 doi:10.48550\/ARXIV.2204.01691","DOI":"10.48550\/ARXIV.2204.01691"},{"key":"e_1_3_2_1_2_1","unstructured":"Anonymous. 2025. WearVox: An Egocentric Multichannel Voice Assistant Benchmark for Wearables. In Submitted to The Fourteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=QpaNErg7ug under review."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611164"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2503.07148"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","unstructured":"Shuai Bai Keqin Chen Xuejing Liu et al. 2025. Qwen2.5-VL Technical Report. CoRR Vol. abs\/2502.13923 (2025). arXiv:2502.13923 doi:10.48550\/ARXIV.2502.13923","DOI":"10.48550\/ARXIV.2502.13923"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19778-9_38"},{"key":"e_1_3_2_1_7_1","article-title":"Natural Language Processing and Neurosymbolic AI: The Role of Neural Networks with Knowledge-Guided Symbolic Approaches","volume":"2","author":"Barnes Emily","year":"2024","unstructured":"Emily Barnes and James Hutson. 2024. Natural Language Processing and Neurosymbolic AI: The Role of Neural Networks with Knowledge-Guided Symbolic Approaches. Journal of Artificial Intelligence and Robotics, Vol. 2, 1 (2024).","journal-title":"Journal of Artificial Intelligence and Robotics"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/S00521-024-09960-Z"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611163"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","unstructured":"Dingxin Cheng Mingda Li Jingyu Liu et al. 2024b. Enhancing Long Video Understanding via Hierarchical Event-Based Memory. CoRR Vol. abs\/2409.06299 (2024). arXiv:2409.06299 doi:10.48550\/ARXIV.2409.06299","DOI":"10.48550\/ARXIV.2409.06299"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","unstructured":"Zesen Cheng Sicong Leng Hang Zhang et al. 2024a. VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs. CoRR Vol. abs\/2406.07476 (2024). arXiv:2406.07476 doi:10.48550\/ARXIV.2406.07476","DOI":"10.48550\/ARXIV.2406.07476"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2502.20843"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2501.05435"},{"key":"e_1_3_2_1_14_1","volume-title":"Giovanni Maria Farinella, et al","author":"Damen Dima","year":"2018","unstructured":"Dima Damen, Hazel Doughty, Giovanni Maria Farinella, et al., 2018. Scaling Egocentric Vision: The EPIC-KITCHENS Dataset. CoRR, Vol. abs\/1804.02748 (2018). arXiv:1804.02748 http:\/\/arxiv.org\/abs\/1804.02748"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3701551.3704124"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561143"},{"key":"e_1_3_2_1_17_1","volume-title":"Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023","author":"Deng Xiang","year":"2023","unstructured":"Xiang Deng, Yu Gu, Boyuan Zheng, et al., 2023. Mind2Web: Towards a Generalist Agent for the Web. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01229"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3682076"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCE.2025.3526427"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00536"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680533.3697064"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","unstructured":"Pascale Fung Yoram Bachrach Asli Celikyilmaz et al. 2025. Embodied AI Agents: Modeling the World. CoRR Vol. abs\/2506.22355 (2025). arXiv:2506.22355 doi:10.48550\/ARXIV.2506.22355","DOI":"10.48550\/ARXIV.2506.22355"},{"key":"e_1_3_2_1_24_1","unstructured":"Google DeepMind. 2025. Gemini 2.5 Pro Preview Model Card. Technical Report. Google. https:\/\/storage.googleapis.com\/model-cards\/documents\/gemini-2.5-pro-preview.pdf Technical report (preview release)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.ACL-LONG.371"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1049\/cit2.70084"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwac035"},{"key":"e_1_3_2_1_29_1","first-page":"4830","article-title":"Why Vision Language Models Struggle with Visual Arithmetic? Towards Enhanced Chart and Geometry Understanding. In Findings of the Association for Computational Linguistics","author":"Huang Kung-Hsiang","year":"2025","unstructured":"Kung-Hsiang Huang, Can Qin, et al., 2025a. Why Vision Language Models Struggle with Visual Arithmetic? Towards Enhanced Chart and Geometry Understanding. In Findings of the Association for Computational Linguistics, ACL. ACL, 4830-4843.","journal-title":"ACL. ACL"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2016.53"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3749513"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","unstructured":"Aaron Hurst Adam Lerer Adam P. Goucher et al. 2024. GPT-4o System Card. CoRR Vol. abs\/2410.21276 (2024). arXiv:2410.21276 doi:10.48550\/ARXIV.2410.21276","DOI":"10.48550\/ARXIV.2410.21276"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","unstructured":"Minseo Kwon Yaesol Kim et al. 2024. Fast and Accurate Task Planning using Neuro-Symbolic Language Models and Multi-level Goal Decomposition. CoRR Vol. abs\/2409.19250 (2024). arXiv:2409.19250 doi:10.48550\/ARXIV.2409.19250","DOI":"10.48550\/ARXIV.2409.19250"},{"key":"e_1_3_2_1_34_1","article-title":"LLaVA-OneVision: Easy Visual Task","volume":"2025","author":"Li Bo","year":"2025","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, et al., 2025d. LLaVA-OneVision: Easy Visual Task Transfer. Trans. Mach. Learn. Res., Vol. 2025 (2025).","journal-title":"Transfer. Trans. Mach. Learn. Res."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2503.15275"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","unstructured":"Xinhao Li Yi Wang Jiashuo Yu et al. 2025b. VideoChat-Flash: Hierarchical Compression for Long-Context Video Modeling. CoRR Vol. abs\/2501.00574 (2025). arXiv:2501.00574 doi:10.48550\/ARXIV.2501.00574","DOI":"10.48550\/ARXIV.2501.00574"},{"key":"e_1_3_2_1_37_1","volume-title":"MDSD: Multi-Turn Diverse Synthetic Dialog Generation for Domain Specific Incomplete Requests Understanding. SSRN Electronic Journal","author":"Li Xi","year":"2025","unstructured":"Xi Li, Xiaoxu Wu, Lijuan Xiao, et al., 2025c. MDSD: Multi-Turn Diverse Synthetic Dialog Generation for Domain Specific Incomplete Requests Understanding. SSRN Electronic Journal (2025), 1-9."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.EMNLP-MAIN.342"},{"key":"e_1_3_2_1_39_1","unstructured":"LiveKit Contributors. 2025. LiveKit: Open-source WebRTC and realtime AI infrastructure. https:\/\/github.com\/livekit\/livekit."},{"key":"e_1_3_2_1_40_1","volume-title":"The Thirteenth International Conference on Learning Representations, ICLR","author":"Lu Yaxi","year":"2025","unstructured":"Yaxi Lu, Shenzhi Yang, Cheng Qian, et al., 2025. Proactive Agent: Shifting LLM Agents from Reactive Responses to Active Assistance. In The Thirteenth International Conference on Learning Representations, ICLR, Singapore. OpenReview.net."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00843"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.FINDINGS-EMNLP.360"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-62849-8_35"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2504.04550"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2502.03671"},{"key":"e_1_3_2_1_46_1","volume-title":"Gorilla: Large Language Model Connected with Massive APIs. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024","author":"Patil Shishir G.","year":"2024","unstructured":"Shishir G. Patil, Tianjun Zhang, Xin Wang, et al., 2024. Gorilla: Large Language Model Connected with Massive APIs. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","unstructured":"Taiying Peng Jiacheng Hua Miao Liu et al. 2025. In the Eye of MLLM: Benchmarking Egocentric Video Intent Understanding with Gaze-Guided Prompting. CoRR Vol. abs\/2509.07447 (2025). arXiv:2509.07447 doi:10.48550\/ARXIV.2509.07447","DOI":"10.48550\/ARXIV.2509.07447"},{"key":"e_1_3_2_1_48_1","volume-title":"HD-EPIC: A Highly-Detailed Egocentric Video Dataset. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. Computer Vision Foundation \/ IEEE, 23901-23913","author":"Perrett Toby","year":"2025","unstructured":"Toby Perrett, Ahmad Darkhalil, Saptarshi Sinha, et al., 2025. HD-EPIC: A Highly-Detailed Egocentric Video Dataset. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. Computer Vision Foundation \/ IEEE, 23901-23913."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","unstructured":"Zhangyang Qi Zhixiong Zhang Ye Fang et al. 2025. GPT4Scene: Understand 3D Scenes from Videos with Vision-Language Models. CoRR Vol. abs\/2501.01428 (2025). arXiv:2501.01428 doi:10.48550\/ARXIV.2501.01428","DOI":"10.48550\/ARXIV.2501.01428"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.ACL-LONG.61"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","unstructured":"Ram Ramrakhya Matthew Chang Xavier Puig et al. 2025. Grounding Multimodal LLMs to Embodied Agents that Ask for Help with Reinforcement Learning. CoRR Vol. abs\/2504.00907 (2025). arXiv:2504.00907 doi:10.48550\/ARXIV.2504.00907","DOI":"10.48550\/ARXIV.2504.00907"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","unstructured":"Machel Reid Nikolay Savinov Denis Teplyashin et al. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. CoRR Vol. abs\/2403.05530 (2024). arXiv:2403.05530 doi:10.48550\/ARXIV.2403.05530","DOI":"10.48550\/ARXIV.2403.05530"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3118208"},{"key":"e_1_3_2_1_54_1","volume-title":"Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Schick Timo","year":"2023","unstructured":"Timo Schick, Jane Dwivedi-Yu, Roberto Dess\u00ec, et al., 2023. Toolformer: Language Models Can Teach Themselves to Use Tools. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems, NeurIPS, New Orleans, LA, USA."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.1446"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2025.FINDINGS-NAACL.410"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00641"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2308.13561"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIPRO.2015.7160422"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","unstructured":"Shulin Tian Ruiqi Wang Hongming Guo et al. 2025a. Ego-R1: Chain-of-Tool-Thought for Ultra-Long Egocentric Video Reasoning. CoRR Vol. abs\/2506.13654 (2025). arXiv:2506.13654 doi:10.48550\/ARXIV.2506.13654","DOI":"10.48550\/ARXIV.2506.13654"},{"key":"e_1_3_2_1_62_1","first-page":"13682","volume-title":"ACL","author":"Tian Shulin","year":"2025","unstructured":"Shulin Tian, Ziniu Zhang, Liangyu Chen, et al., 2025b. MMInA: Benchmarking Multihop Multimodal Internet Agents. In Findings of the Association for Computational Linguistics, ACL 2025. ACL, 13682-13697."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-020-15871-z"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.59287\/icaens.1127"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","unstructured":"Zishen Wan Che-Kai Liu Hanchen Yang et al. 2024. Towards Cognitive AI Systems: a Survey and Prospective on Neuro-Symbolic AI. CoRR Vol. abs\/2401.01040 (2024). arXiv:2401.01040 doi:10.48550\/ARXIV.2401.01040","DOI":"10.48550\/ARXIV.2401.01040"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2505.11533"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1007\/S11704-024-40231-1"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2312.05269"},{"key":"e_1_3_2_1_69_1","volume-title":"Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. In Advances in Neural Information Processing Systems 35: Annual Conference on NeurIPS.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, et al., 2022. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. In Advances in Neural Information Processing Systems 35: Annual Conference on NeurIPS."},{"key":"e_1_3_2_1_70_1","volume-title":"LLM-powered Autonomous Agents. lilianweng.github.io (Jun","author":"Weng Lilian","year":"2023","unstructured":"Lilian Weng. 2023. LLM-powered Autonomous Agents. lilianweng.github.io (Jun 2023). https:\/\/lilianweng.github.io\/posts\/2023-06-23-agent\/"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.FINDINGS-EMNLP.145"},{"key":"e_1_3_2_1_72_1","unstructured":"xAI. 2025. Grok 3 Beta \u2014 The Age of Reasoning Agents. https:\/\/x.ai\/blog\/grok-3."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2407.08516"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02690"},{"key":"e_1_3_2_1_75_1","unstructured":"Sicheng Yang Yukai Huang Weitong Cai et al. 2025a. Plug-and-Play Clarifier: A Zero-Shot Multimodal Framework for Egocentric Intent Disambiguation. arXiv preprint arXiv:2511.08971 (2025)."},{"key":"e_1_3_2_1_76_1","unstructured":"Sicheng Yang Yukai Huang Shitong Sun et al. 2026. Optimizing Multimodal LLMs for Egocentric Video Understanding: A Solution for the HD-EPIC VQA Challenge. arXiv:2601.10228 [cs.CV] https:\/\/arxiv.org\/abs\/2601.10228"},{"key":"e_1_3_2_1_77_1","volume-title":"ReAct: Synergizing Reasoning and Acting in Language Models. In The Eleventh International Conference on Learning Representations, ICLR 2023","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Jeffrey Zhao, Dian Yu, et al., 2023. ReAct: Synergizing Reasoning and Acting in Language Models. In The Eleventh International Conference on Learning Representations, ICLR 2023, Kigali, Rwanda, May 1-5, 2023. OpenReview.net."},{"key":"e_1_3_2_1_78_1","volume-title":"MMEgo: Towards Building Egocentric Multimodal LLMs for Video QA. In The Thirteenth International Conference on Learning Representations, ICLR","author":"Ye Hanrong","year":"2025","unstructured":"Hanrong Ye, Haotian Zhang, Erik A. Daxberger, et al., 2025. MMEgo: Towards Building Egocentric Multimodal LLMs for Video QA. In The Thirteenth International Conference on Learning Representations, ICLR, Singapore. OpenReview.net."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","unstructured":"Asaf Yehudai Lilach Eden et al. 2025. Survey on Evaluation of LLM-based Agents. CoRR Vol. abs\/2503.16416 (2025). arXiv:2503.16416 doi:10.48550\/ARXIV.2503.16416","DOI":"10.48550\/ARXIV.2503.16416"},{"key":"e_1_3_2_1_80_1","volume-title":"Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Yi Kexin","year":"2018","unstructured":"Kexin Yi, Jiajun Wu, Chuang Gan, et al., 2018. Neural-Symbolic VQA: Disentangling Reasoning from Vision and Language Understanding. In Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems, NeurIPS, Montr\u00e9al, Canada. 1039-1050."},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.EMNLP-MAIN.505"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1016\/J.NEUNET.2023.06.028"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","unstructured":"Zihao Yue Zhenru Lin Yifan Song et al. 2025. MiMo-VL Technical Report. CoRR Vol. abs\/2506.03569 (2025). arXiv:2506.03569 doi:10.48550\/ARXIV.2506.03569","DOI":"10.48550\/ARXIV.2506.03569"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1145\/3675094.3678992"},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713190"},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","unstructured":"Haoyu Zhang Qiaohui Chu Meng Liu et al. 2025b. Exo2Ego: Exocentric Knowledge Guided MLLM for Egocentric Video Understanding. CoRR Vol. abs\/2503.09143 (2025). arXiv:2503.09143 doi:10.48550\/ARXIV.2503.09143","DOI":"10.48550\/ARXIV.2503.09143"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","unstructured":"Hangtao Zhang Chenyu Zhu Xianlong Wang et al. 2024 e. BadRobot: Jailbreaking LLM-based Embodied AI in the Physical World. CoRR Vol. abs\/2407.20242 (2024). arXiv:2407.20242 doi:10.48550\/ARXIV.2407.20242","DOI":"10.48550\/ARXIV.2407.20242"},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"publisher","unstructured":"Peiyuan Zhang Kaichen Zhang Bo Li et al. 2024d. Long Context Transfer from Language to Vision. CoRR Vol. abs\/2406.16852 (2024). arXiv:2406.16852 doi:10.48550\/ARXIV.2406.16852","DOI":"10.48550\/ARXIV.2406.16852"},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.ACL-LONG.578"},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.FINDINGS-EMNLP.636"},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","unstructured":"Xuan Zhang Yongliang Shen Zhe Zheng et al. 2025d. AskToAct: Enhancing LLMs Tool Use via Self-Correcting Clarification. CoRR Vol. abs\/2503.01940 (2025). arXiv:2503.01940 doi:10.48550\/ARXIV.2503.01940","DOI":"10.48550\/ARXIV.2503.01940"},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2506.05904"},{"key":"e_1_3_2_1_93_1","volume-title":"Forty-first International Conference on Machine Learning, ICML 2024","author":"Zheng Boyuan","year":"2024","unstructured":"Boyuan Zheng, Boyu Gou, Jihyung Kil, Huan Sun, and Yu Su. 2024. GPT-4V(ision) is a Generalist Web Agent, if Grounded. In Forty-first International Conference on Machine Learning, ICML 2024, Vienna, Austria, July 21-27, 2024. OpenReview.net."},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","unstructured":"Zhuo Zhi Qiangqiang Wu et al. 2025. VideoAgent2: Enhancing the LLM-Based Agent System for Long-Form Video Understanding by Uncertainty-Aware CoT. CoRR Vol. abs\/2504.04471 (2025). arXiv:2504.04471 doi:10.48550\/ARXIV.2504.04471","DOI":"10.48550\/ARXIV.2504.04471"},{"key":"e_1_3_2_1_95_1","volume-title":"WebArena: A Realistic Web Environment for Building Autonomous Agents. In The Twelfth International Conference on Learning Representations, ICLR","author":"Zhou Shuyan","year":"2024","unstructured":"Shuyan Zhou, Frank F. Xu, Hao Zhu, et al., 2024. WebArena: A Realistic Web Environment for Building Autonomous Agents. In The Twelfth International Conference on Learning Representations, ICLR 2024. OpenReview.net."},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2312.15719"},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2411.14466"}],"event":{"name":"WWW '26: The ACM Web Conference 2026","location":"Dubai United Arab Emirates","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 2026"],"original-title":[],"deposited":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T16:27:35Z","timestamp":1775838455000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774904.3792996"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,12]]},"references-count":97,"alternative-id":["10.1145\/3774904.3792996","10.1145\/3774904"],"URL":"https:\/\/doi.org\/10.1145\/3774904.3792996","relation":{},"subject":[],"published":{"date-parts":[[2026,4,12]]},"assertion":[{"value":"2026-04-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}