{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T00:19:45Z","timestamp":1759969185923,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T00:00:00Z","timestamp":1746662400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,8]]},"DOI":"10.1145\/3701716.3717525","type":"proceedings-article","created":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T16:20:01Z","timestamp":1748017201000},"page":"2381-2390","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Identifying User Goals From UI Trajectories"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-4337-4954","authenticated-orcid":false,"given":"Omri","family":"Berkovitch","sequence":"first","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2298-0605","authenticated-orcid":false,"given":"Sapir","family":"Caduri","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0097-7417","authenticated-orcid":false,"given":"Noam","family":"Kahlon","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1451-3438","authenticated-orcid":false,"given":"Anatoly","family":"Efros","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0573-1075","authenticated-orcid":false,"given":"Avi","family":"Caciularu","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0165-606X","authenticated-orcid":false,"given":"Ido","family":"Dagan","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA and Bar-Ilan University, Ramat Gan, Israel"}]}],"member":"320","published-online":{"date-parts":[[2025,5,23]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/0004-3702(93)90060-O"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126594.3126651"},{"key":"e_1_3_2_2_4_1","volume-title":"Mind2Web: Towards a Generalist Agent for the Web. arXiv preprint arXiv:2306.06070","author":"Deng Xiang","year":"2023","unstructured":"Xiang Deng, Yu Gu, Boyuan Zheng, Shijie Chen, Samuel Stevens, Boshi Wang, Huan Sun, and Yu Su. 2023. Mind2Web: Towards a Generalist Agent for the Web. arXiv preprint arXiv:2306.06070 (2023)."},{"key":"e_1_3_2_2_5_1","volume-title":"Shixiang Shane Gu, and Izzeddin Gur","author":"Furuta Hiroki","year":"2023","unstructured":"Hiroki Furuta, Kuang-Huei Lee, Ofir Nachum, Yutaka Matsuo, Aleksandra Faust, Shixiang Shane Gu, and Izzeddin Gur. 2023. Multimodal web navigation with instruction-finetuned foundation models. arXiv preprint arXiv:2305.11854 (2023)."},{"key":"e_1_3_2_2_6_1","volume-title":"A new model of plan recognition. arXiv preprint arXiv:1301.6700","author":"Goldman Robert P","year":"2013","unstructured":"Robert P Goldman, Christopher W Geib, and Christopher A Miller. 2013. A new model of plan recognition. arXiv preprint arXiv:1301.6700 (2013)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470482.3479636"},{"key":"e_1_3_2_2_8_1","volume-title":"Proceedings of the 31st. AAAI Conference: Plan, Activity and Intent Recognition Workshop","author":"Granada Roger L","year":"2017","unstructured":"Roger L Granada, Ramon Fraga Pereira, Juarez Monteiro, Duncan Dubugras Alcoba Ruiz, Rodrigo Coelho Barros, and Felipe Rech Meneguzzi. 2017. Hybrid activity and plan recognition for video streams. In Proceedings of the 31st. AAAI Conference: Plan, Activity and Intent Recognition Workshop, 2017, Estados Unidos."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1609\/icwsm.v8i1.14505"},{"key":"e_1_3_2_2_10_1","volume-title":"Mustafa Safdari, Yutaka Matsuo, Douglas Eck, and Aleksandra Faust.","author":"Gur Izzeddin","year":"2023","unstructured":"Izzeddin Gur, Hiroki Furuta, Austin Huang, Mustafa Safdari, Yutaka Matsuo, Douglas Eck, and Aleksandra Faust. 2023. A real-world webagent with planning, long context understanding, and program synthesis. arXiv preprint arXiv:2307.12856 (2023)."},{"key":"e_1_3_2_2_11_1","volume-title":"Mining purchase intent in twitter. Computaci\u00f3n y Sistemas 23, 3","author":"Haque Rejwanul","year":"2019","unstructured":"Rejwanul Haque, Arvind Ramadurai, Mohammed Hasanuzzaman, and AndyWay. 2019. Mining purchase intent in twitter. Computaci\u00f3n y Sistemas 23, 3 (2019), 871--881."},{"key":"e_1_3_2_2_12_1","volume-title":"WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models. arXiv preprint arXiv:2401.13919","author":"He Hongliang","year":"2024","unstructured":"Hongliang He, Wenlin Yao, Kaixin Ma, Wenhao Yu, Yong Dai, Hongming Zhang, Zhenzhong Lan, and Dong Yu. 2024. WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models. arXiv preprint arXiv:2401.13919 (2024)."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.830"},{"key":"e_1_3_2_2_14_1","volume-title":"Cogagent: A visual language model for gui agents. arXiv preprint arXiv:2312.08914","author":"Hong Wenyi","year":"2023","unstructured":"Wenyi Hong, Weihan Wang, Qingsong Lv, Jiazheng Xu, Wenmeng Yu, Junhui Ji, Yan Wang, Zihan Wang, Yuxiao Dong, Ming Ding, et al. 2023. Cogagent: A visual language model for gui agents. arXiv preprint arXiv:2312.08914 (2023)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/1526709.1526773"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00224"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73113-6_10"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"crossref","unstructured":"Henry A Kautz et al. 1991. A formal theory of plan recognition and its implementation. Reasoning about plans (1991) 69--125.","DOI":"10.1016\/B978-1-55860-137-6.50008-X"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.3390\/s22010323"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/INDICON49873.2020.9342516"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/1964897.1964918"},{"key":"e_1_3_2_2_22_1","volume-title":"A survey of intent classification and slot-filling datasets for task-oriented dialog. arXiv preprint arXiv:2207.13211","author":"Larson Stefan","year":"2022","unstructured":"Stefan Larson and Kevin Leach. 2022. A survey of intent classification and slot-filling datasets for task-oriented dialog. arXiv preprint arXiv:2207.13211 (2022)."},{"key":"e_1_3_2_2_23_1","volume-title":"UINav: A maker of UI automation agents. arXiv preprint arXiv:2312.10170","author":"Li Wei","year":"2023","unstructured":"Wei Li, Fu-Lin Hsu, Will Bishop, Folawiyo Campbell-Ajala, Oriana Riva, and Max Lin. 2023. UINav: A maker of UI automation agents. arXiv preprint arXiv:2312.10170 (2023)."},{"key":"e_1_3_2_2_24_1","volume-title":"Mapping natural language instructions to mobile UI action sequences. arXiv preprint arXiv:2005.03776","author":"Li Yang","year":"2020","unstructured":"Yang Li, Jiacong He, Xin Zhou, Yuan Zhang, and Jason Baldridge. 2020. Mapping natural language instructions to mobile UI action sequences. arXiv preprint arXiv:2005.03776 (2020)."},{"key":"e_1_3_2_2_25_1","unstructured":"Yuanchun Li Hao Wen Weijun Wang Xiangyu Li Yizhen Yuan Guohong Liu Jiacheng Liu Wenxing Xu Xiang Wang Yi Sun et al. 2024. Personal llm agents: Insights and survey about the capability efficiency and security. arXiv preprint arXiv:2401.05459 (2024)."},{"key":"e_1_3_2_2_26_1","volume-title":"Location-based activity recognition. Advances in neural information processing systems 18","author":"Liao Lin","year":"2005","unstructured":"Lin Liao, Dieter Fox, and Henry Kautz. 2005. Location-based activity recognition. Advances in neural information processing systems 18 (2005)."},{"key":"e_1_3_2_2_27_1","volume-title":"Autonomous evaluation and refinement of digital agents. arXiv preprint arXiv:2404.06474","author":"Pan Jiayi","year":"2024","unstructured":"Jiayi Pan, Yichi Zhang, Nicholas Tomlin, Yifei Zhou, Sergey Levine, and Alane Suhr. 2024. Autonomous evaluation and refinement of digital agents. arXiv preprint arXiv:2404.06474 (2024)."},{"key":"e_1_3_2_2_28_1","volume-title":"Android in the wild: A large-scale dataset for android device control. arXiv preprint arXiv:2307.10088","author":"Rawles Christopher","year":"2023","unstructured":"Christopher Rawles, Alice Li, Daniel Rodriguez, Oriana Riva, and Timothy Lillicrap. 2023. Android in the wild: A large-scale dataset for android device control. arXiv preprint arXiv:2307.10088 (2023)."},{"key":"e_1_3_2_2_29_1","unstructured":"Machel Reid Nikolay Savinov Denis Teplyashin Dmitry Lepikhin Timothy Lillicrap Jean-baptiste Alayrac Radu Soricut Angeliki Lazaridou Orhan Firat Julian Schrittwieser et al. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2019.2954966"},{"key":"e_1_3_2_2_31_1","volume-title":"David Pynadath, and Robert P Goldman.","author":"Sukthankar Gita","year":"2014","unstructured":"Gita Sukthankar, Christopher Geib, Hung Hai Bui, David Pynadath, and Robert P Goldman. 2014. Plan, activity, and intent recognition: Theory and practice. Newnes."},{"key":"e_1_3_2_2_32_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems 35 (2022), 24824--24837."},{"key":"e_1_3_2_2_33_1","volume-title":"Droidbot-gpt: Gpt-powered ui automation for android. arXiv preprint arXiv:2304.07061","author":"Wen Hao","year":"2023","unstructured":"Hao Wen, Hongming Wang, Jiaxuan Liu, and Yuanchun Li. 2023. Droidbot-gpt: Gpt-powered ui automation for android. arXiv preprint arXiv:2304.07061 (2023)."},{"key":"e_1_3_2_2_34_1","volume-title":"Grounding open-domain instructions to automate web support tasks. arXiv preprint arXiv:2103.16057","author":"Xu Nancy","year":"2021","unstructured":"Nancy Xu, Sam Masling, Michael Du, Giovanni Campagna, Larry Heck, James Landay, and Monica S Lam. 2021. Grounding open-domain instructions to automate web support tasks. arXiv preprint arXiv:2103.16057 (2021)."},{"key":"e_1_3_2_2_35_1","volume-title":"Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v. arXiv preprint arXiv:2310.11441","author":"Yang Jianwei","year":"2023","unstructured":"Jianwei Yang, Hao Zhang, Feng Li, Xueyan Zou, Chunyuan Li, and Jianfeng Gao. 2023. Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v. arXiv preprint arXiv:2310.11441 (2023)."},{"key":"e_1_3_2_2_36_1","volume-title":"Appagent: Multimodal agents as smartphone users. arXiv preprint arXiv:2312.13771","author":"Yang Zhao","year":"2023","unstructured":"Zhao Yang, Jiaxuan Liu, Yucheng Han, Xin Chen, Zebiao Huang, Bin Fu, and Gang Yu. 2023. Appagent: Multimodal agents as smartphone users. arXiv preprint arXiv:2312.13771 (2023)."},{"key":"e_1_3_2_2_37_1","first-page":"20744","article-title":"Webshop: Towards scalable real-world web interaction with grounded language agents","volume":"35","author":"Yao Shunyu","year":"2022","unstructured":"Shunyu Yao, Howard Chen, John Yang, and Karthik Narasimhan. 2022. Webshop: Towards scalable real-world web interaction with grounded language agents. Advances in Neural Information Processing Systems 35 (2022), 20744--20757.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547906"},{"key":"e_1_3_2_2_39_1","volume-title":"Android in the Zoo: Chain-of-Action-Thought for GUI Agents. arXiv preprint arXiv:2403.02713","author":"Zhang Jiwen","year":"2024","unstructured":"Jiwen Zhang, Jihao Wu, Yihua Teng, Minghui Liao, Nuo Xu, Xiao Xiao, Zhongyu Wei, and Duyu Tang. 2024. Android in the Zoo: Chain-of-Action-Thought for GUI Agents. arXiv preprint arXiv:2403.02713 (2024)."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3073267"},{"key":"e_1_3_2_2_41_1","volume-title":"if grounded. arXiv preprint arXiv:2401.01614","author":"Zheng Boyuan","year":"2024","unstructured":"Boyuan Zheng, Boyu Gou, Jihyung Kil, Huan Sun, and Yu Su. 2024. Gpt-4v (ision) is a generalist web agent, if grounded. arXiv preprint arXiv:2401.01614 (2024)."},{"key":"e_1_3_2_2_42_1","volume-title":"Webarena: A realistic web environment for building autonomous agents. arXiv preprint arXiv:2307.13854","author":"Zhou Shuyan","year":"2023","unstructured":"Shuyan Zhou, Frank F Xu, Hao Zhu, Xuhui Zhou, Robert Lo, Abishek Sridhar, Xianyi Cheng, Yonatan Bisk, Daniel Fried, Uri Alon, et al. 2023. Webarena: A realistic web environment for building autonomous agents. arXiv preprint arXiv:2307.13854 (2023)."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Sydney NSW Australia","acronym":"WWW '25"},"container-title":["Companion Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717525","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3701716.3717525","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T03:06:12Z","timestamp":1759892772000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717525"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,8]]},"references-count":42,"alternative-id":["10.1145\/3701716.3717525","10.1145\/3701716"],"URL":"https:\/\/doi.org\/10.1145\/3701716.3717525","relation":{},"subject":[],"published":{"date-parts":[[2025,5,8]]},"assertion":[{"value":"2025-05-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}