{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T10:18:18Z","timestamp":1769163498861,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T00:00:00Z","timestamp":1746662400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,8]]},"DOI":"10.1145\/3701716.3717526","type":"proceedings-article","created":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T16:20:01Z","timestamp":1748017201000},"page":"2391-2400","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Agent-Initiated Interaction in Phone UI Automation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-0097-7417","authenticated-orcid":false,"given":"Noam","family":"Kahlon","sequence":"first","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2537-9018","authenticated-orcid":false,"given":"Guy","family":"Rom","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1451-3438","authenticated-orcid":false,"given":"Anatoly","family":"Efros","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3553-3611","authenticated-orcid":false,"given":"Filippo","family":"Galgani","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4337-4954","authenticated-orcid":false,"given":"Omri","family":"Berkovitch","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2298-0605","authenticated-orcid":false,"given":"Sapir","family":"Caduri","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4377-3989","authenticated-orcid":false,"given":"William E.","family":"Bishop","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4514-009X","authenticated-orcid":false,"given":"Oriana","family":"Riva","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0165-606X","authenticated-orcid":false,"given":"Ido","family":"Dagan","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, CA, USA and Bar-Ilan University, Ramat Gan, Israel"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,5,23]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"William E Bishop Alice Li Christopher Rawles and Oriana Riva. 2024. Latent State Estimation Helps UI Agents to Reason. arxiv: 2405.11120 [cs.AI]"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_18"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3166054.3166058"},{"key":"e_1_3_2_2_4_1","volume-title":"Mind2Web: Towards a Generalist Agent for the Web. arXiv preprint arXiv:2306.06070","author":"Deng Xiang","year":"2023","unstructured":"Xiang Deng, Yu Gu, Boyuan Zheng, Shijie Chen, Samuel Stevens, Boshi Wang, Huan Sun, and Yu Su. 2023. Mind2Web: Towards a Generalist Agent for the Web. arXiv preprint arXiv:2306.06070 (2023)."},{"key":"e_1_3_2_2_5_1","volume-title":"Cogagent: A visual language model for gui agents. arXiv preprint arXiv:2312.08914","author":"Hong Wenyi","year":"2023","unstructured":"Wenyi Hong, Weihan Wang, Qingsong Lv, Jiazheng Xu, Wenmeng Yu, Junhui Ji, Yan Wang, Zihan Wang, Yuxiao Dong, Ming Ding, et al. 2023. Cogagent: A visual language model for gui agents. arXiv preprint arXiv:2312.08914 (2023)."},{"key":"e_1_3_2_2_6_1","volume-title":"Sang-goo Lee, and Taeuk Kim.","author":"Kim Hyuhng Joon","year":"2024","unstructured":"Hyuhng Joon Kim, Youna Kim, Cheonbok Park, Junyeob Kim, Choonghyun Park, Kang Min Yoo, Sang-goo Lee, and Taeuk Kim. 2024. Aligning Language Models to Explicitly Handle Ambiguity. arXiv preprint arXiv:2404.11972 (2024)."},{"key":"e_1_3_2_2_7_1","first-page":"92130","article-title":"On the effects of data scale on ui control agents","volume":"37","author":"Li Wei","year":"2025","unstructured":"Wei Li, William E Bishop, Alice Li, Christopher Rawles, Folawiyo Campbell-Ajala, Divya Tyamagundlu, and Oriana Riva. 2025. On the effects of data scale on ui control agents. Advances in Neural Information Processing Systems, Vol. 37 (2025), 92130--92154.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_8_1","volume-title":"UINav: A maker of UI automation agents. arXiv preprint arXiv:2312.10170","author":"Li Wei","year":"2023","unstructured":"Wei Li, Fu-Lin Hsu, Will Bishop, Folawiyo Campbell-Ajala, Oriana Riva, and Max Lin. 2023. UINav: A maker of UI automation agents. arXiv preprint arXiv:2312.10170 (2023)."},{"key":"e_1_3_2_2_9_1","volume-title":"Mapping natural language instructions to mobile UI action sequences. arXiv preprint arXiv:2005.03776","author":"Li Yang","year":"2020","unstructured":"Yang Li, Jiacong He, Xin Zhou, Yuan Zhang, and Jason Baldridge. 2020. Mapping natural language instructions to mobile UI action sequences. arXiv preprint arXiv:2005.03776 (2020)."},{"key":"e_1_3_2_2_10_1","unstructured":"Yuanchun Li Hao Wen Weijun Wang Xiangyu Li Yizhen Yuan Guohong Liu Jiacheng Liu Wenxing Xu Xiang Wang Yi Sun et al. 2024. Personal llm agents: Insights and survey about the capability efficiency and security. arXiv preprint arXiv:2401.05459 (2024)."},{"key":"e_1_3_2_2_11_1","volume-title":"Weblinx: Real-world website navigation with multi-turn dialogue. arXiv preprint arXiv:2402.05930","author":"L\u00f9 Xing Han","year":"2024","unstructured":"Xing Han L\u00f9, Zden\u011bk Kasner, and Siva Reddy. 2024. Weblinx: Real-world website navigation with multi-turn dialogue. arXiv preprint arXiv:2402.05930 (2024)."},{"key":"e_1_3_2_2_12_1","volume-title":"Android in the wild: A large-scale dataset for android device control. arXiv preprint arXiv:2307.10088","author":"Rawles Christopher","year":"2023","unstructured":"Christopher Rawles, Alice Li, Daniel Rodriguez, Oriana Riva, and Timothy Lillicrap. 2023. Android in the wild: A large-scale dataset for android device control. arXiv preprint arXiv:2307.10088 (2023)."},{"key":"e_1_3_2_2_13_1","unstructured":"Machel Reid Nikolay Savinov Denis Teplyashin Dmitry Lepikhin Timothy Lillicrap Jean-baptiste Alayrac Radu Soricut Angeliki Lazaridou Orhan Firat Julian Schrittwieser et al. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)."},{"key":"e_1_3_2_2_14_1","unstructured":"Allen Z Ren Anushri Dixit Alexandra Bodrova Sumeet Singh Stephen Tu Noah Brown Peng Xu Leila Takayama Fei Xia Jake Varley et al. 2023. Robots that ask for help: Uncertainty alignment for large language model planners. arXiv preprint arXiv:2307.01928 (2023)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/NOMS54207.2022.9789886"},{"key":"e_1_3_2_2_16_1","volume-title":"META-GUI: towards multi-modal conversational agents on mobile GUI. arXiv preprint arXiv:2205.11029","author":"Sun Liangtai","year":"2022","unstructured":"Liangtai Sun, Xingyu Chen, Lu Chen, Tianle Dai, Zichen Zhu, and Kai Yu. 2022. META-GUI: towards multi-modal conversational agents on mobile GUI. arXiv preprint arXiv:2205.11029 (2022)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3461778.3462124"},{"key":"e_1_3_2_2_18_1","volume-title":"Androidenv: A reinforcement learning platform for android. arXiv preprint arXiv:2105.13231","author":"Toyama Daniel","year":"2021","unstructured":"Daniel Toyama, Philippe Hamel, Anita Gergely, Gheorghe Comanici, Amelia Glaese, Zafarali Ahmed, Tyler Jackson, Shibl Mourad, and Doina Precup. 2021. Androidenv: A reinforcement learning platform for android. arXiv preprint arXiv:2105.13231 (2021)."},{"key":"e_1_3_2_2_19_1","volume-title":"UGIF: UI Grounded Instruction Following. arXiv preprint arXiv:2211.07615","author":"Venkatesh Sagar Gubbi","year":"2022","unstructured":"Sagar Gubbi Venkatesh, Partha Talukdar, and Srini Narayanan. 2022. UGIF: UI Grounded Instruction Following. arXiv preprint arXiv:2211.07615 (2022)."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580895"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472749.3474765"},{"key":"e_1_3_2_2_22_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824--24837."},{"key":"e_1_3_2_2_23_1","volume-title":"Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu.","author":"Wen Hao","year":"2023","unstructured":"Hao Wen, Yuanchun Li, Guohong Liu, Shanhui Zhao, Tao Yu, Toby Jia-Jun Li, Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu. 2023. Empowering llm to use smartphone for intelligent task automation. arXiv preprint arXiv:2308.15272 (2023)."},{"key":"e_1_3_2_2_24_1","unstructured":"An Yan Zhengyuan Yang Wanrong Zhu Kevin Lin Linjie Li Jianfeng Wang Jianwei Yang Yiwu Zhong Julian McAuley Jianfeng Gao et al. 2023. Gpt-4v in wonderland: Large multimodal models for zero-shot smartphone gui navigation. arXiv preprint arXiv:2311.07562 (2023)."},{"key":"e_1_3_2_2_25_1","volume-title":"Appagent: Multimodal agents as smartphone users. arXiv preprint arXiv:2312.13771","author":"Yang Zhao","year":"2023","unstructured":"Zhao Yang, Jiaxuan Liu, Yucheng Han, Xin Chen, Zebiao Huang, Bin Fu, and Gang Yu. 2023. Appagent: Multimodal agents as smartphone users. arXiv preprint arXiv:2312.13771 (2023)."},{"key":"e_1_3_2_2_26_1","volume-title":"You only look at screens: Multimodal chain-of-action agents. arXiv preprint arXiv:2309.11436","author":"Zhan Zhuosheng","year":"2023","unstructured":"Zhuosheng Zhan and Aston Zhang. 2023. You only look at screens: Multimodal chain-of-action agents. arXiv preprint arXiv:2309.11436 (2023)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445186"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11431-020-1692-3"}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Companion Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717526","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3701716.3717526","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T03:06:32Z","timestamp":1759892792000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717526"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,8]]},"references-count":28,"alternative-id":["10.1145\/3701716.3717526","10.1145\/3701716"],"URL":"https:\/\/doi.org\/10.1145\/3701716.3717526","relation":{},"subject":[],"published":{"date-parts":[[2025,5,8]]},"assertion":[{"value":"2025-05-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}