{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,10]],"date-time":"2026-05-10T08:07:23Z","timestamp":1778400443325,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":132,"publisher":"ACM","funder":[{"name":"General Research Funds from the Hong Kong Research Grants Council","award":["project no. PolyU 15207322, 15200023, 15206024, 15224524"],"award-info":[{"award-number":["project no. PolyU 15207322, 15200023, 15206024, 15224524"]}]},{"name":"Internal Research Funds from The Hong Kong Polytechnic University","award":["project no. P0042693, P0048625, P0051361, P0052406, P0052986"],"award-info":[{"award-number":["project no. P0042693, P0048625, P0051361, P0052406, P0052986"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3736555","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T20:52:41Z","timestamp":1754254361000},"page":"6140-6150","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":24,"title":["A Survey of WebAgents: Towards Next-Generation AI Agents for Web Automation with Large Foundation Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6903-8996","authenticated-orcid":false,"given":"Liangbo","family":"Ning","sequence":"first","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1483-5810","authenticated-orcid":false,"given":"Ziran","family":"Liang","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0746-5015","authenticated-orcid":false,"given":"Zhuohang","family":"Jiang","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7129-8586","authenticated-orcid":false,"given":"Haohao","family":"Qu","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2945-1107","authenticated-orcid":false,"given":"Yujuan","family":"Ding","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4049-1233","authenticated-orcid":false,"given":"Wenqi","family":"Fan","sequence":"additional","affiliation":[{"name":"Department of Computing, and Department of Management and Marketing, The Hong Kong Polytechnic University, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5706-5177","authenticated-orcid":false,"given":"Xiao-yong","family":"Wei","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1439-2514","authenticated-orcid":false,"given":"Shanru","family":"Lin","sequence":"additional","affiliation":[{"name":"City University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3555-3495","authenticated-orcid":false,"given":"Hui","family":"Liu","sequence":"additional","affiliation":[{"name":"Michigan State University, Michigan, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3491-5968","authenticated-orcid":false,"given":"Philip S.","family":"Yu","sequence":"additional","affiliation":[{"name":"University of Illinois at Chicago, Chicago, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3370-471X","authenticated-orcid":false,"given":"Qing","family":"Li","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong, Hong Kong"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2658"},{"key":"e_1_3_2_1_2_1","volume-title":"NeurIPS 2024 Workshop on Open-World Agents.","author":"Abuelsaad Tamer","unstructured":"Tamer Abuelsaad, Deepak Akkil, Prasenjit Dey, Ashish Jagmohan, Aditya Vempaty, and Ravi Kokku. [n.d.]. Agent-E: From Autonomous Web Navigation to Foundational Design Principles in Agentic Systems. In NeurIPS 2024 Workshop on Open-World Agents."},{"key":"e_1_3_2_1_3_1","volume-title":"The Thirteenth International Conference on Learning Representations.","author":"Agashe Saaket","unstructured":"Saaket Agashe, Jiuzhou Han, Shuyu Gan, Jiachen Yang, Ang Li, and Xin Eric Wang. [n.d.]. Agent S: An Open Agentic Framework that Uses Computers Like a Human. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3690653"},{"key":"e_1_3_2_1_5_1","unstructured":"Gilles Baechler Srinivas Sunkara Maria Wang Fedir Zubach Hassan Mansoor Vincent Etter Victor C\u0103rbune Jason Lin Jindong Chen and Abhanshu Sharma. 2024. ScreenAI: A vision-language model for ui and infographics understanding. arXiv preprint arXiv:2402.04615(2024)."},{"key":"e_1_3_2_1_6_1","first-page":"12461","article-title":"Digirl: Training in-the-wild device-control agents with autonomous reinforcement learning","volume":"37","author":"Bai Hao","year":"2025","unstructured":"Hao Bai, Yifei Zhou, Jiayi Pan, Mert Cemri, Alane Suhr, Sergey Levine, and Aviral Kumar. 2025. Digirl: Training in-the-wild device-control agents with autonomous reinforcement learning. Advances in Neural Information Processing Systems, Vol. 37 (2025), 12461-12495.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_7_1","volume-title":"Lexi: Self-supervised learning of the ui language. arXiv preprint arXiv:2301.10165(2023).","author":"Banerjee Pratyay","year":"2023","unstructured":"Pratyay Banerjee, Shweti Mahajan, Kushal Arora, Chitta Baral, and Oriana Riva. 2023. Lexi: Self-supervised learning of the ui language. arXiv preprint arXiv:2301.10165(2023)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Andrea Burns Kate Saenko and Bryan A Plummer. 2024. Tell Me What's Next: Textual Foresight for Generic UI Representations. arXiv preprint arXiv:2406.07822(2024).","DOI":"10.18653\/v1\/2024.findings-acl.273"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Yi Cai and Brenda J Cude. 2016. Online shopping. Handbook of consumer finance research(2016) 339-355.","DOI":"10.1007\/978-3-319-28887-1_28"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2220365.2220367"},{"key":"e_1_3_2_1_11_1","volume-title":"The Thirteenth International Conference on Learning Representations.","author":"Chae Hyungjoo","year":"2024","unstructured":"Hyungjoo Chae, Namyoung Kim, Kai Tzu-iunn Ong, Minju Gwak, Gwanwoo Song, Jihoon Kim, Sunghwan Kim, Dongha Lee, and Jinyoung Yeo. 2024. Web agents with world models: Learning and leveraging environment dynamics in web navigation. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Pei Chen Boran Han and Shuai Zhang. 2024a. CoMM: Collaborative multi-agent multi-reasoning-path prompting for complex problem solving. arXiv preprint arXiv:2404.17729(2024).","DOI":"10.18653\/v1\/2024.findings-naacl.112"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27878"},{"key":"e_1_3_2_1_14_1","volume-title":"EDGE: Enhanced grounded gui understanding with enriched multi-granularity synthetic data. arXiv preprint arXiv:2410.19461(2024).","author":"Chen Xuetian","year":"2024","unstructured":"Xuetian Chen, Hangcheng Li, Jiaqing Liang, Sihang Jiang, and Deqing Yang. 2024b. EDGE: Enhanced grounded gui understanding with enriched multi-granularity synthetic data. arXiv preprint arXiv:2410.19461(2024)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.505"},{"key":"e_1_3_2_1_16_1","volume-title":"Caap: Context-aware action planning prompting to solve computer tasks with front-end ui only. arXiv preprint arXiv:2406.06947(2024).","author":"Cho Junhee","year":"2024","unstructured":"Junhee Cho, Jihoon Kim, Daseul Bae, Jinho Choo, Youngjune Gwon, and Yeong-Dae Kwon. 2024. Caap: Context-aware action planning prompting to solve computer tasks with front-end ui only. arXiv preprint arXiv:2406.06947(2024)."},{"key":"e_1_3_2_1_17_1","unstructured":"Colin B Clement Matthew Bierbaum Kevin P O'Keeffe and Alexander A Alemi. 2019. On the use of arxiv as a dataset. arXiv preprint arXiv:1905.00075(2019)."},{"key":"e_1_3_2_1_18_1","first-page":"28091","article-title":"Mind2web: Towards a generalist agent for the web","volume":"36","author":"Deng Xiang","year":"2023","unstructured":"Xiang Deng, Yu Gu, Boyuan Zheng, Shijie Chen, Sam Stevens, Boshi Wang, Huan Sun, and Yu Su. 2023. Mind2web: Towards a generalist agent for the web. Advances in Neural Information Processing Systems, Vol. 36 (2023), 28091-28114.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","unstructured":"Zehang Deng Yongjian Guo Changzhou Han Wanlun Ma Junwu Xiong Sheng Wen and Yang Xiang. 2024. Ai agents under threat: A survey of key security challenges and future pathways. Comput. Surveys(2024)."},{"key":"e_1_3_2_1_20_1","unstructured":"Nicolai Dorka Janusz Marecki and Ammar Anwar. 2024. Training a vision language model as smartphone assistant. arXiv preprint arXiv:2404.08755(2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671470"},{"key":"e_1_3_2_1_22_1","unstructured":"Wenqi Fan Yi Zhou Shijie Wang Yuyao Yan Hui Liu Qian Zhao Le Song and Qing Li. 2025. Computational Protein Science in the Era of Large Language Models (LLMs). arXiv preprint arXiv:2501.10282(2025)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3661357"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Moghis Fereidouni Adib Mosharrof and AB Siddique. 2024. Grounded Language Agent for Product Search via Intelligent Web Interactions. arXiv preprint arXiv:2404.10887(2024).","DOI":"10.18653\/v1\/2024.customnlp4u-1.7"},{"key":"e_1_3_2_1_25_1","volume-title":"Shixiang Shane Gu, and Izzeddin Gur","author":"Furuta Hiroki","year":"2023","unstructured":"Hiroki Furuta, Kuang-Huei Lee, Ofir Nachum, Yutaka Matsuo, Aleksandra Faust, Shixiang Shane Gu, and Izzeddin Gur. 2023a. Multimodal web navigation with instruction-finetuned foundation models. arXiv preprint arXiv:2305.11854(2023)."},{"key":"e_1_3_2_1_26_1","unstructured":"Hiroki Furuta Yutaka Matsuo Aleksandra Faust and Izzeddin Gur. 2023b. Exposing limitations of language model agents in sequential-task compositions on the web. arXiv preprint arXiv:2311.18751(2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"Assistgui: Task-oriented desktop graphical user interface automation. arXiv preprint arXiv:2312.13108(2023).","author":"Gao Difei","year":"2023","unstructured":"Difei Gao, Lei Ji, Zechen Bai, Mingyu Ouyang, Peiran Li, Dongxing Mao, Qinchen Wu, Weichen Zhang, Peiyi Wang, Xiangwu Guo, et al., 2023. Assistgui: Task-oriented desktop graphical user interface automation. arXiv preprint arXiv:2312.13108(2023)."},{"key":"e_1_3_2_1_28_1","unstructured":"Boyu Gou Ruohan Wang Boyuan Zheng Yanan Xie Cheng Chang Yiheng Shu Huan Sun and Yu Su. 2024. Navigating the digital world as humans do: Universal visual grounding for GUI agents. arXiv preprint arXiv:2410.05243(2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3616855.3635739"},{"key":"e_1_3_2_1_30_1","unstructured":"Yu Gu Boyuan Zheng Boyu Gou Kai Zhang Cheng Chang Sanjari Srivastava Yanan Xie Peng Qi Huan Sun and Yu Su. 2024. Is your llm secretly a world model of the internet? model-based planning for web agents. arXiv preprint arXiv:2411.06559(2024)."},{"key":"e_1_3_2_1_31_1","unstructured":"Yanchu Guan Dong Wang Zhixuan Chu Shiyu Wang Feiyue Ni Ruihua Song Longfei Li Jinjie Gu and Chenyi Zhuang. 2023. Intelligent Virtual Assistants with LLM-based Process Automation. CoRR(2023)."},{"key":"e_1_3_2_1_32_1","first-page":"106190","article-title":"RedCode: Risky Code Execution and Generation Benchmark for Code Agents","volume":"37","author":"Guo Chengquan","year":"2025","unstructured":"Chengquan Guo, Xun Liu, Chulin Xie, Andy Zhou, Yi Zeng, Zinan Lin, Dawn Song, and Bo Li. 2025. RedCode: Risky Code Execution and Generation Benchmark for Code Agents. Advances in Neural Information Processing Systems, Vol. 37 (2025), 106190-106236.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_33_1","first-page":"134721","article-title":"Can llms solve molecule puzzles? a multimodal benchmark for molecular structure elucidation","volume":"37","author":"Guo Kehan","year":"2024","unstructured":"Kehan Guo, Bozhao Nan, Yujun Zhou, Taicheng Guo, Zhichun Guo, Mihir Surve, Zhenwen Liang, Nitesh Chawla, Olaf Wiest, and Xiangliang Zhang. 2024. Can llms solve molecule puzzles? a multimodal benchmark for molecular structure elucidation. Advances in Neural Information Processing Systems, Vol. 37 (2024), 134721-134746.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_34_1","volume-title":"Mustafa Safdari, Yutaka Matsuo, Douglas Eck, and Aleksandra Faust.","author":"Gur Izzeddin","year":"2023","unstructured":"Izzeddin Gur, Hiroki Furuta, Austin Huang, Mustafa Safdari, Yutaka Matsuo, Douglas Eck, and Aleksandra Faust. 2023. A real-world webagent with planning, long context understanding, and program synthesis. arXiv preprint arXiv:2307.12856(2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Mustafa Safdari, Yutaka Matsuo, Douglas Eck, and Aleksandra Faust.","author":"Gur Izzeddin","year":"2024","unstructured":"Izzeddin Gur, Hiroki Furuta, Austin V Huang, Mustafa Safdari, Yutaka Matsuo, Douglas Eck, and Aleksandra Faust. 2024. A Real-World WebAgent with Planning, Long Context Understanding, and Program Synthesis. In ICLR."},{"key":"e_1_3_2_1_36_1","unstructured":"Bernal Jim\u00e9nez Guti\u00e9rrez Yiheng Shu Weijian Qi Sizhe Zhou and Yu Su. 2025. From RAG to Memory: Non-Parametric Continual Learning for Large Language Models. arXiv preprint arXiv:2502.14802(2025)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.371"},{"key":"e_1_3_2_1_38_1","volume-title":"Clickagent: Enhancing ui location capabilities of autonomous agents. arXiv preprint arXiv:2410.11872(2024).","author":"Hoscilowicz Jakub","year":"2024","unstructured":"Jakub Hoscilowicz, Bartosz Maj, Bartosz Kozakiewicz, Oleksii Tymoshchuk, and Artur Janicki. 2024. Clickagent: Enhancing ui location capabilities of autonomous agents. arXiv preprint arXiv:2410.11872(2024)."},{"key":"e_1_3_2_1_39_1","unstructured":"Xueyu Hu Tao Xiong Biao Yi Zishu Wei Ruixuan Xiao Yurun Chen Jiasheng Ye Meiling Tao Xiangxin Zhou Ziyu Zhao et al. 2024. Os agents: A survey on mllm-based agents for general computing devices use."},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Machine Learning. PMLR, 9466-9482","author":"Humphreys Peter C","year":"2022","unstructured":"Peter C Humphreys, David Raposo, Tobias Pohlen, Gregory Thornton, Rachita Chhaparia, Alistair Muldal, Josh Abramson, Petko Georgiev, Adam Santoro, and Timothy Lillicrap. 2022. A data-driven approach for learning to control computers. In International Conference on Machine Learning. PMLR, 9466-9482."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2015.07.004"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-demos.8"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/953460.953505"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01369"},{"key":"e_1_3_2_1_45_1","first-page":"39648","article-title":"Language models can solve computer tasks","volume":"36","author":"Kim Geunwoo","year":"2023","unstructured":"Geunwoo Kim, Pierre Baldi, and Stephen McAleer. 2023. Language models can solve computer tasks. Advances in Neural Information Processing Systems, Vol. 36 (2023), 39648-39677.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.964"},{"key":"e_1_3_2_1_47_1","volume-title":"Information retrieval on the web. ACM computing surveys (CSUR)","author":"Kobayashi Mei","year":"2000","unstructured":"Mei Kobayashi and Koichi Takeda. 2000. Information retrieval on the web. ACM computing surveys (CSUR), Vol. 32, 2 (2000), 144-173."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.50"},{"key":"e_1_3_2_1_49_1","unstructured":"Jing Yu Koh Stephen McAleer Daniel Fried and Ruslan Salakhutdinov. 2024b. Tree search for language model agents. arXiv preprint arXiv:2407.01476(2024)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1002\/widm.1218"},{"key":"e_1_3_2_1_51_1","volume-title":"Elaine Chang, Vaughn Robinson, Sean Hendryx, Shuyan Zhou, Matt Fredrikson, et al.","author":"Kumar Priyanshu","year":"2024","unstructured":"Priyanshu Kumar, Elaine Lau, Saranya Vijayakumar, Tu Trinh, Scale Red Team, Elaine Chang, Vaughn Robinson, Sean Hendryx, Shuyan Zhou, Matt Fredrikson, et al., 2024. Refusal-trained llms are easily jailbroken as browser agents. arXiv preprint arXiv:2410.13886(2024)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671620"},{"key":"e_1_3_2_1_53_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Lee Kenton","year":"2023","unstructured":"Kenton Lee, Mandar Joshi, Iulia Raluca Turc, Hexiang Hu, Fangyu Liu, Julian Martin Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, and Kristina Toutanova. 2023. Pix2struct: Screenshot parsing as pretraining for visual language understanding. In International Conference on Machine Learning. PMLR, 18893-18912."},{"key":"e_1_3_2_1_54_1","volume-title":"Spotlight: Mobile ui understanding using vision-language models with a focus. arXiv preprint arXiv:2209.14927(2022).","author":"Li Gang","year":"2022","unstructured":"Gang Li and Yang Li. 2022. Spotlight: Mobile ui understanding using vision-language models with a focus. arXiv preprint arXiv:2209.14927(2022)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.753"},{"key":"e_1_3_2_1_56_1","first-page":"92130","article-title":"On the effects of data scale on ui control agents","volume":"37","author":"Li Wei","year":"2025","unstructured":"Wei Li, William E Bishop, Alice Li, Christopher Rawles, Folawiyo Campbell-Ajala, Divya Tyamagundlu, and Oriana Riva. 2025. On the effects of data scale on ui control agents. Advances in Neural Information Processing Systems, Vol. 37 (2025), 92130-92154.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_57_1","unstructured":"Wei Li Fu-Lin Hsu Will Bishop Folawiyo Campbell-Ajala Max Lin and Oriana Riva. 2023a. UINav: A practical approach to train on-device automation agents. arXiv preprint arXiv:2312.10170(2023)."},{"key":"e_1_3_2_1_58_1","unstructured":"Yuanchun Li Hao Wen Weijun Wang Xiangyu Li Yizhen Yuan Guohong Liu Jiacheng Liu Wenxing Xu Xiang Wang Yi Sun et al. 2024a. Personal llm agents: Insights and survey about the capability efficiency and security. arXiv preprint arXiv:2401.05459(2024)."},{"key":"e_1_3_2_1_59_1","volume-title":"Mini-gemini: Mining the potential of multi-modality vision language models. arXiv preprint arXiv:2403.18814(2024).","author":"Li Yanwei","year":"2024","unstructured":"Yanwei Li, Yuechen Zhang, Chengyao Wang, Zhisheng Zhong, Yixin Chen, Ruihang Chu, Shaoteng Liu, and Jiaya Jia. 2024c. Mini-gemini: Mining the potential of multi-modality vision language models. arXiv preprint arXiv:2403.18814(2024)."},{"key":"e_1_3_2_1_60_1","volume-title":"Jeff Nichols, Yinfei Yang, and Zhe Gan.","author":"Li Zhangheng","year":"2024","unstructured":"Zhangheng Li, Keen You, Haotian Zhang, Di Feng, Harsh Agrawal, Xiujun Li, Mohana Prasad Sathya Moorthy, Jeff Nichols, Yinfei Yang, and Zhe Gan. 2024b. Ferret-UI 2: Mastering universal user interface understanding across platforms. arXiv preprint arXiv:2410.18967(2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"Eia: Environmental injection attack on generalist web agents for privacy leakage. arXiv preprint arXiv:2409.11295(2024).","author":"Liao Zeyi","year":"2024","unstructured":"Zeyi Liao, Lingbo Mo, Chejian Xu, Mintong Kang, Jiawei Zhang, Chaowei Xiao, Yuan Tian, Bo Li, and Huan Sun. 2024. Eia: Environmental injection attack on generalist web agents for privacy leakage. arXiv preprint arXiv:2409.11295(2024)."},{"key":"e_1_3_2_1_62_1","unstructured":"Kevin Qinghong Lin Linjie Li Difei Gao Zhengyuan Yang Shiwei Wu Zechen Bai Weixian Lei Lijuan Wang and Mike Zheng Shou. 2024. ShowUI: One vision-language-action model for gui visual agent. arXiv preprint arXiv:2411.17465(2024)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3546872"},{"key":"e_1_3_2_1_64_1","unstructured":"Junpeng Liu Tianyue Ou Yifan Song Yuxiao Qu Wai Lam Chenyan Xiong Wenhu Chen Graham Neubig and Xiang Yue. 2024b. Harnessing webpage UIs for text-rich visual understanding. arXiv preprint arXiv:2410.13824(2024)."},{"key":"e_1_3_2_1_65_1","volume-title":"Jiadai Sun, Jiaqi Wang, et al.","author":"Liu Xiao","year":"2024","unstructured":"Xiao Liu, Bo Qin, Dongzhu Liang, Guang Dong, Hanyu Lai, Hanchen Zhang, Hanlin Zhao, Iat Long Iong, Jiadai Sun, Jiaqi Wang, et al., 2024c. Autoglm: Autonomous foundation agents for GUIs. arXiv preprint arXiv:2411.00820(2024)."},{"key":"e_1_3_2_1_66_1","volume-title":"Moleculargpt: Open large language model (llm) for few-shot molecular property prediction. arXiv preprint arXiv:2406.12950(2024).","author":"Liu Yuyan","year":"2024","unstructured":"Yuyan Liu, Sirui Ding, Sheng Zhou, Wenqi Fan, and Qiaoyu Tan. 2024a. Moleculargpt: Open large language model (llm) for few-shot molecular property prediction. arXiv preprint arXiv:2406.12950(2024)."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.685"},{"key":"e_1_3_2_1_68_1","unstructured":"Yadong Lu Jianwei Yang Yelong Shen and Ahmed Awadallah. 2024. Omniparser for pure vision based gui agent. arXiv preprint arXiv:2408.00203(2024)."},{"key":"e_1_3_2_1_69_1","volume-title":"LASER: LLM Agent with State-Space Exploration for Web Navigation. In NeurIPS 2023 Foundation Models for Decision Making Workshop.","author":"Ma Kaixin","unstructured":"Kaixin Ma, Hongming Zhang, Hongwei Wang, Xiaoman Pan, and Dong Yu. [n.d.]. LASER: LLM Agent with State-Space Exploration for Web Navigation. In NeurIPS 2023 Foundation Models for Decision Making Workshop."},{"key":"e_1_3_2_1_70_1","first-page":"27529","article-title":"GenRL: Multimodal-foundation world models for generalization in embodied agents","volume":"37","author":"Mazzaglia Pietro","year":"2025","unstructured":"Pietro Mazzaglia, Tim Verbelen, Bart Dhoedt, Aaron C Courville, and Sai Rajeswar Mudumba. 2025. GenRL: Multimodal-foundation world models for generalization in embodied agents. Advances in Neural Information Processing Systems, Vol. 37 (2025), 27529-27555.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_71_1","volume-title":"Aios: Llm agent operating system. arXiv preprint arXiv:2403.16971(2024).","author":"Mei Kai","year":"2024","unstructured":"Kai Mei, Xi Zhu, Wujiang Xu, Wenyue Hua, Mingyu Jin, Zelong Li, Shuyuan Xu, Ruosong Ye, Yingqiang Ge, and Yongfeng Zhang. 2024. Aios: Llm agent operating system. arXiv preprint arXiv:2403.16971(2024)."},{"key":"e_1_3_2_1_72_1","unstructured":"Manisha Mukherjee Sungchul Kim Xiang Chen Dan Luo Tong Yu and Tung Mai. 2025. From Documents to Dialogue: Building KG-RAG Enhanced AI Assistants. arXiv:2502.15237 [cs.IR] https:\/\/arxiv.org\/abs\/2502.15237"},{"key":"e_1_3_2_1_73_1","volume-title":"Manning","author":"Murty Shikhar","year":"2024","unstructured":"Shikhar Murty, Hao Zhu, Dzmitry Bahdanau, and Christopher D. Manning. 2024. NNetNav: Unsupervised Learning of Browser Agents Through Environment Interaction in the Wild. arXiv preprint arXiv:2410.02907(2024)."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640457.3688061"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2024\/711"},{"key":"e_1_3_2_1_76_1","first-page":"91618","article-title":"Synatra: Turning indirect knowledge into direct demonstrations for digital agents at scale","volume":"37","author":"Ou Tianyue","year":"2025","unstructured":"Tianyue Ou, Frank F Xu, Aman Madaan, Jiarui Liu, Robert Lo, Abishek Sridhar, Sudipta Sengupta, Dan Roth, Graham Neubig, and Shuyan Zhou. 2025. Synatra: Turning indirect knowledge into direct demonstrations for digital agents at scale. Advances in Neural Information Processing Systems, Vol. 37 (2025), 91618-91652.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_77_1","unstructured":"Ajay Patel Markus Hofmarcher Claudiu Leoveanu-Condrei Marius-Constantin Dinu Chris Callison-Burch and Sepp Hochreiter. 2024. Large language models can self-improve at web agent tasks. arXiv preprint arXiv:2405.20309(2024)."},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-industry.9"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-024-00944-1"},{"key":"e_1_3_2_1_80_1","first-page":"59708","article-title":"Androidinthewild: A large-scale dataset for android device control","volume":"36","author":"Rawles Christopher","year":"2023","unstructured":"Christopher Rawles, Alice Li, Daniel Rodriguez, Oriana Riva, and Timothy Lillicrap. 2023. Androidinthewild: A large-scale dataset for android device control. Advances in Neural Information Processing Systems, Vol. 36 (2023), 59708-59728.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_81_1","volume-title":"Infogent: An agent-based framework for web information aggregation. arXiv preprint arXiv:2410.19054(2024).","author":"Reddy Revanth Gangi","year":"2024","unstructured":"Revanth Gangi Reddy, Sagnik Mukherjee, Jeonghwan Kim, Zhenhailong Wang, Dilek Hakkani-Tur, and Heng Ji. 2024. Infogent: An agent-based framework for web information aggregation. arXiv preprint arXiv:2410.19054(2024)."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jnca.2012.02.011"},{"key":"e_1_3_2_1_83_1","volume-title":"Naviqate: Functionality-guided web application navigation. arXiv preprint arXiv:2409.10741(2024).","author":"Shahbandeh Mobina","year":"2024","unstructured":"Mobina Shahbandeh, Parsa Alian, Noor Nashid, and Ali Mesbah. 2024. Naviqate: Functionality-guided web application navigation. arXiv preprint arXiv:2409.10741(2024)."},{"key":"e_1_3_2_1_84_1","unstructured":"Huawen Shen Chang Liu Gengluo Li Xinlong Wang Yu Zhou Can Ma and Xiangyang Ji. 2024b. Falcon-UI: Understanding GUI Before Following User Instructions. arXiv preprint arXiv:2412.09362(2024)."},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"crossref","unstructured":"Junhong Shen Atishay Jain Zedian Xiao Ishan Amlekar Mouad Hadji Aaron Podolny and Ameet Talwalkar. 2024a. ScribeAgent: Towards Specialized Web Agents Using Production-Scale Workflow Data. arXiv preprint arXiv:2411.15004(2024).","DOI":"10.32388\/8VOG0O"},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"crossref","unstructured":"Segev Shlomov Aviad Sela Ido Levy Liane Galanti Roy Abitbol et al. 2024. From grounding to planning: Benchmarking bottlenecks in web agents. arXiv preprint arXiv:2409.01927(2024).","DOI":"10.3233\/FAIA251390"},{"key":"e_1_3_2_1_87_1","unstructured":"Significant Gravitas. [n.d.]. AutoGPT. https:\/\/agpt.co\/"},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"publisher","DOI":"10.7763\/IJMLC.2012.V2.137"},{"key":"e_1_3_2_1_89_1","volume-title":"Step: Stacked llm policies for web actions. arXiv preprint arXiv:2310.03720(2023).","author":"Sodhi Paloma","year":"2023","unstructured":"Paloma Sodhi, SRK Branavan, Yoav Artzi, and Ryan McDonald. 2023. Step: Stacked llm policies for web actions. arXiv preprint arXiv:2310.03720(2023)."},{"key":"e_1_3_2_1_90_1","volume-title":"Beyond Browsing: API-Based Web Agents. arXiv preprint arXiv:2410.16464(2024).","author":"Song Yueqi","year":"2024","unstructured":"Yueqi Song, Frank Xu, Shuyan Zhou, and Graham Neubig. 2024b. Beyond Browsing: API-Based Web Agents. arXiv preprint arXiv:2410.16464(2024)."},{"key":"e_1_3_2_1_91_1","unstructured":"Zirui Song Yaohang Li Meng Fang Zhenhao Chen Zecheng Shi Yuan Huang and Ling Chen. 2024a. MMAC-Copilot: Multi-modal Agent Collaboration Operating System Copilot. CoRR(2024)."},{"key":"e_1_3_2_1_92_1","volume-title":"NeurIPS 2024 Workshop on Open-World Agents.","author":"Tan Weihao","unstructured":"Weihao Tan, Wentao Zhang, Xinrun Xu, Haochong Xia, Gang Ding, Boyu Li, Bohan Zhou, Junpeng Yue, Jiechuan Jiang, Yewen Li, et al., [n.d.]. Cradle: Empowering Foundation Agents towards General Computer Control. In NeurIPS 2024 Workshop on Open-World Agents."},{"key":"e_1_3_2_1_93_1","volume-title":"Steward: Natural language web automation. arXiv preprint arXiv:2409.15441(2024).","author":"Tang Brian","year":"2024","unstructured":"Brian Tang and Kang G Shin. 2024. Steward: Natural language web automation. arXiv preprint arXiv:2409.15441(2024)."},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"crossref","unstructured":"Heyi Tao TV Sethuraman Michal Shlapentokh-Rothman and Derek Hoiem. 2023. WebWISE: Web Interface Control and Sequential Exploration with Large Language Models. CoRR(2023).","DOI":"10.18653\/v1\/2024.findings-naacl.234"},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605098.3635903"},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"publisher","DOI":"10.1145\/1135777.1135863"},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"crossref","unstructured":"Bo Wang Weiyi He Pengfei He Shenglai Zeng Zhen Xiang Yue Xing and Jiliang Tang. 2025. Unveiling Privacy Risks in LLM Agent Memory. arXiv preprint arXiv:2502.13172(2025).","DOI":"10.18653\/v1\/2025.acl-long.1227"},{"key":"e_1_3_2_1_98_1","first-page":"99040","article-title":"ALI-Agent: Assessing LLMs' Alignment with Human Values via Agent-based Evaluation","volume":"37","author":"Wang Han","year":"2024","unstructured":"Han Wang, An Zhang, Nguyen Duy Tai, Jun Sun, Tat-Seng Chua, et al., 2024b. ALI-Agent: Assessing LLMs' Alignment with Human Values via Agent-based Evaluation. Advances in Neural Information Processing Systems, Vol. 37 (2024), 99040-99088.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_99_1","volume-title":"Oscar: Operating system control via state-aware reasoning and re-planning. arXiv preprint arXiv:2410.18963(2024).","author":"Wang Xiaoqiang","year":"2024","unstructured":"Xiaoqiang Wang and Bang Liu. 2024. Oscar: Operating system control via state-aware reasoning and re-planning. arXiv preprint arXiv:2410.18963(2024)."},{"key":"e_1_3_2_1_100_1","doi-asserted-by":"crossref","unstructured":"Yiqin Wang Haoji Zhang Jingqi Tian and Yansong Tang. 2024c. Ponder & press: Advancing visual gui agent towards general computer control. arXiv preprint arXiv:2412.01268(2024).","DOI":"10.18653\/v1\/2025.findings-acl.76"},{"key":"e_1_3_2_1_101_1","unstructured":"Zora Zhiruo Wang Jiayuan Mao Daniel Fried and Graham Neubig. 2024a. Agent workflow memory. arXiv preprint arXiv:2409.07429(2024)."},{"key":"e_1_3_2_1_102_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824-24837."},{"key":"e_1_3_2_1_103_1","volume-title":"Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu.","author":"Wen Hao","year":"2023","unstructured":"Hao Wen, Yuanchun Li, Guohong Liu, Shanhui Zhao, Tao Yu, Toby Jia-Jun Li, Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu. 2023. Empowering llm to use smartphone for intelligent task automation. arXiv preprint arXiv:2308.15272, Vol. 4 (2023)."},{"key":"e_1_3_2_1_104_1","volume-title":"Dissecting Adversarial Robustness of Multimodal LM Agents. In The Thirteenth International Conference on Learning Representations.","author":"Wu Chen Henry","unstructured":"Chen Henry Wu, Rishi Rajesh Shah, Jing Yu Koh, Russ Salakhutdinov, Daniel Fried, and Aditi Raghunathan. [n.d.]. Dissecting Adversarial Robustness of Multimodal LM Agents. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_105_1","doi-asserted-by":"publisher","DOI":"10.1145\/1321440.1321449"},{"key":"e_1_3_2_1_106_1","volume-title":"Wipi: A new web threat for llm-driven web agents. arXiv preprint arXiv:2402.16965(2024).","author":"Wu Fangzhou","year":"2024","unstructured":"Fangzhou Wu, Shutong Wu, Yulong Cao, and Chaowei Xiao. 2024b. Wipi: A new web threat for llm-driven web agents. arXiv preprint arXiv:2402.16965(2024)."},{"key":"e_1_3_2_1_107_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11280-024-01291-2"},{"key":"e_1_3_2_1_108_1","volume-title":"Autogen: Enabling next-gen llm applications via multi-agent conversation. arXiv preprint arXiv:2308.08155(2023).","author":"Wu Qingyun","year":"2023","unstructured":"Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Beibin Li, Erkang Zhu, Li Jiang, Xiaoyun Zhang, Shaokun Zhang, Jiale Liu, et al., 2023. Autogen: Enabling next-gen llm applications via multi-agent conversation. arXiv preprint arXiv:2308.08155(2023)."},{"key":"e_1_3_2_1_109_1","unstructured":"Qinzhuo Wu Weikai Xu Wei Liu Tao Tan Jianfeng Liu Ang Li Jian Luan Bin Wang and Shuo Shang. 2024d. MobileVLM: A vision-language model for better intra-and inter-ui understanding. arXiv preprint arXiv:2409.14818(2024)."},{"key":"e_1_3_2_1_110_1","volume-title":"OS-Copilot: Towards Generalist Computer Agents with Self-Improvement. In ICLR 2024 Workshop on Large Language Model (LLM) Agents.","author":"Wu Zhiyong","year":"2024","unstructured":"Zhiyong Wu, Chengcheng Han, Zichen Ding, Zhenmin Weng, Zhoumianze Liu, Shunyu Yao, Tao Yu, and Lingpeng Kong. 2024a. OS-Copilot: Towards Generalist Computer Agents with Self-Improvement. In ICLR 2024 Workshop on Large Language Model (LLM) Agents."},{"key":"e_1_3_2_1_111_1","volume-title":"Paul Pu Liang, et al","author":"Wu Zhiyong","year":"2024","unstructured":"Zhiyong Wu, Zhenyu Wu, Fangzhi Xu, Yian Wang, Qiushi Sun, Chengyou Jia, Kanzhi Cheng, Zichen Ding, Liheng Chen, Paul Pu Liang, et al., 2024c. OS-Atlas: A foundation action model for generalist GUI agents. arXiv preprint arXiv:2410.23218(2024)."},{"key":"e_1_3_2_1_112_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4222-0"},{"key":"e_1_3_2_1_113_1","volume-title":"Advweb: Controllable black-box attacks on vlm-powered web agents. arXiv preprint arXiv:2410.17401(2024).","author":"Xu Chejian","year":"2024","unstructured":"Chejian Xu, Mintong Kang, Jiawei Zhang, Zeyi Liao, Lingbo Mo, Mengqi Yuan, Huan Sun, and Bo Li. 2024a. Advweb: Controllable black-box attacks on vlm-powered web agents. arXiv preprint arXiv:2410.17401(2024)."},{"key":"e_1_3_2_1_114_1","volume-title":"Benjamin Van Durme, and Daniel Khashabi","author":"Xu Kevin","year":"2024","unstructured":"Kevin Xu, Yeganeh Kordi, Tanay Nayak, Ado Asija, Yizhong Wang, Kate Sanders, Adam Byerly, Jingyu Zhang, Benjamin Van Durme, and Daniel Khashabi. 2024b. Tur [k] ingbench: A challenge benchmark for web agents. arXiv preprint arXiv:2403.11905(2024)."},{"key":"e_1_3_2_1_115_1","unstructured":"Yiheng Xu Dunjie Lu Zhennan Shen Junli Wang Zekun Wang Yuchen Mao Caiming Xiong and Tao Yu. 2024c. AgentTrek: Agent Trajectory Synthesis via Guiding Replay with Web Tutorials. arXiv preprint arXiv:2412.09605(2024)."},{"key":"e_1_3_2_1_116_1","volume-title":"Aguvis: Unified Pure Vision Agents for Autonomous GUI Interaction. arXiv preprint arXiv:2412.04454(2024).","author":"Xu Yiheng","year":"2024","unstructured":"Yiheng Xu, Zekun Wang, Junli Wang, Dunjie Lu, Tianbao Xie, Amrita Saha, Doyen Sahoo, Tao Yu, and Caiming Xiong. 2024d. Aguvis: Unified Pure Vision Agents for Autonomous GUI Interaction. arXiv preprint arXiv:2412.04454(2024)."},{"key":"e_1_3_2_1_117_1","unstructured":"Jianwei Yang Hao Zhang Feng Li Xueyan Zou Chunyuan Li and Jianfeng Gao. 2023b. Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v. arXiv preprint arXiv:2310.11441(2023)."},{"key":"e_1_3_2_1_118_1","volume-title":"AgentOccam: A Simple Yet Strong Baseline for LLM-Based Web Agents. In The Thirteenth International Conference on Learning Representations.","author":"Yang Ke","year":"2024","unstructured":"Ke Yang, Yao Liu, Sapana Chaudhary, Rasool Fakoor, Pratik Chaudhari, George Karypis, and Huzefa Rangwala. 2024. AgentOccam: A Simple Yet Strong Baseline for LLM-Based Web Agents. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_119_1","volume-title":"The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421","author":"Yang Zhengyuan","year":"2023","unstructured":"Zhengyuan Yang, Linjie Li, Kevin Lin, Jianfeng Wang, Chung-Ching Lin, Zicheng Liu, and Lijuan Wang. 2023a. The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421, Vol. 9, 1 (2023), 1."},{"key":"e_1_3_2_1_120_1","volume-title":"R-judge: Benchmarking safety risk awareness for llm agents. arXiv preprint arXiv:2401.10019(2024).","author":"Yuan Tongxin","year":"2024","unstructured":"Tongxin Yuan, Zhiwei He, Lingzhong Dong, Yiming Wang, Ruijie Zhao, Tian Xia, Lizhen Xu, Binglin Zhou, Fangqi Li, Zhuosheng Zhang, et al., 2024. R-judge: Benchmarking safety risk awareness for llm agents. arXiv preprint arXiv:2401.10019(2024)."},{"key":"e_1_3_2_1_121_1","volume-title":"UFO: A UI-Focused Agent for Windows OS Interaction. CoRR(2024).","author":"Zhang Chaoyun","year":"2024","unstructured":"Chaoyun Zhang, Liqun Li, Shilin He, Xu Zhang, Bo Qiao, Si Qin, Minghua Ma, Yu Kang, Qingwei Lin, Saravan Rajmohan, et al., 2024a. UFO: A UI-Focused Agent for Windows OS Interaction. CoRR(2024)."},{"key":"e_1_3_2_1_122_1","doi-asserted-by":"crossref","unstructured":"Jiwen Zhang Jihao Wu Yihua Teng Minghui Liao Nuo Xu Xiao Xiao Zhongyu Wei and Duyu Tang. 2024c. Android in the zoo: Chain-of-action-thought for gui agents. arXiv preprint arXiv:2403.02713(2024).","DOI":"10.18653\/v1\/2024.findings-emnlp.702"},{"key":"e_1_3_2_1_123_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.70"},{"key":"e_1_3_2_1_124_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2019.03.004"},{"key":"e_1_3_2_1_125_1","volume-title":"Privacyasst: Safeguarding user privacy in tool-using large language model agents","author":"Zhang Xinyu","year":"2024","unstructured":"Xinyu Zhang, Huiyu Xu, Zhongjie Ba, Zhibo Wang, Yuan Hong, Jian Liu, Zhan Qin, and Kui Ren. 2024d. Privacyasst: Safeguarding user privacy in tool-using large language model agents. IEEE Transactions on Dependable and Secure Computing(2024)."},{"key":"e_1_3_2_1_126_1","volume-title":"Webpilot: A versatile and autonomous multi-agent system for web task execution with strategic exploration. arXiv preprint arXiv:2408.15978(2024).","author":"Zhang Yao","year":"2024","unstructured":"Yao Zhang, Zijian Ma, Yunpu Ma, Zhen Han, Yu Wu, and Volker Tresp. 2024b. Webpilot: A versatile and autonomous multi-agent system for web task execution with strategic exploration. arXiv preprint arXiv:2408.15978(2024)."},{"key":"e_1_3_2_1_127_1","first-page":"3132","article-title":"You Only Look at Screens","volume":"2024","author":"Zhang Zhuosheng","year":"2024","unstructured":"Zhuosheng Zhang and Aston Zhang. 2024. You Only Look at Screens: Multimodal Chain-of-Action Agents. In Findings of the Association for Computational Linguistics ACL 2024. 3132-3149.","journal-title":"Multimodal Chain-of-Action Agents. In Findings of the Association for Computational Linguistics ACL"},{"key":"e_1_3_2_1_128_1","doi-asserted-by":"crossref","unstructured":"Zihuai Zhao Wenqi Fan Jiatong Li Yunqing Liu Xiaowei Mei Yiqi Wang Zhen Wen Fei Wang Xiangyu Zhao Jiliang Tang et al. 2024. Recommender systems in the era of large language models (llms). IEEE Transactions on Knowledge and Data Engineering(2024).","DOI":"10.1109\/TKDE.2024.3392335"},{"key":"e_1_3_2_1_129_1","volume-title":"Synapse: Trajectory-as-Exemplar Prompting with Memory for Computer Control. In NeurIPS 2023 Foundation Models for Decision Making Workshop.","author":"Zheng Longtao","unstructured":"Longtao Zheng, Rundong Wang, Xinrun Wang, and Bo An. [n.d.]. Synapse: Trajectory-as-Exemplar Prompting with Memory for Computer Control. In NeurIPS 2023 Foundation Models for Decision Making Workshop."},{"key":"e_1_3_2_1_130_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29946"},{"key":"e_1_3_2_1_131_1","doi-asserted-by":"publisher","DOI":"10.1145\/3675812.3675871"},{"key":"e_1_3_2_1_132_1","volume-title":"Webarena: A realistic web environment for building autonomous agents. arXiv preprint arXiv:2307.13854(2023).","author":"Zhou Shuyan","year":"2023","unstructured":"Shuyan Zhou, Frank F Xu, Hao Zhu, Xuhui Zhou, Robert Lo, Abishek Sridhar, Xianyi Cheng, Tianyue Ou, Yonatan Bisk, Daniel Fried, et al., 2023. Webarena: A realistic web environment for building autonomous agents. arXiv preprint arXiv:2307.13854(2023)."}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3736555","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T17:55:29Z","timestamp":1777571729000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3736555"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":132,"alternative-id":["10.1145\/3711896.3736555","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3736555","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}