{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:15:28Z","timestamp":1777655728719,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":92,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,24]],"date-time":"2025-03-24T00:00:00Z","timestamp":1742774400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,24]]},"DOI":"10.1145\/3708359.3712153","type":"proceedings-article","created":{"date-parts":[[2025,3,19]],"date-time":"2025-03-19T12:50:34Z","timestamp":1742388634000},"page":"727-744","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["From Interaction to Impact: Towards Safer AI Agent Through Understanding and Evaluating Mobile UI Operation Impacts"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8708-1429","authenticated-orcid":false,"given":"Zhuohao (Jerry)","family":"Zhang","sequence":"first","affiliation":[{"name":"Information School, University of Washington, Seattle, Washington, USA,"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8951-2878","authenticated-orcid":false,"given":"Eldon","family":"Schoop","sequence":"additional","affiliation":[{"name":"Apple, Seattle, Washington, USA,"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6880-8546","authenticated-orcid":false,"given":"Jeffrey","family":"Nichols","sequence":"additional","affiliation":[{"name":"Apple Inc, San Diego, California, USA,"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2229-5287","authenticated-orcid":false,"given":"Anuj","family":"Mahajan","sequence":"additional","affiliation":[{"name":"Machine Learning Foundations, Apple, Cupertino, California, USA,"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0935-4745","authenticated-orcid":false,"given":"Amanda","family":"Swearngin","sequence":"additional","affiliation":[{"name":"Apple, Seattle, Washington, USA,"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,3,24]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"2024. Heaviside step function. https:\/\/en.wikipedia.org\/w\/index.php?title=Heaviside_step_function&oldid=1247145934 Page Version ID: 1247145934."},{"key":"e_1_3_3_3_3_2","unstructured":"2024. Jaccard index. https:\/\/en.wikipedia.org\/w\/index.php?title=Jaccard_index&oldid=1247603955 Page Version ID: 1247603955."},{"key":"e_1_3_3_3_4_2","unstructured":"Anisha Agarwal Aaron Chan Shubham Chandel Jinu Jang Shaun Miller Roshanak\u00a0Zilouchian Moghaddam Yevhen Mohylevskyy Neel Sundaresan and Michele Tufano. 2024. Copilot Evaluation Harness: Evaluating LLM-Guided Software Programming. ArXiv abs\/2402.14261 (2024). https:\/\/api.semanticscholar.org\/CorpusID:267782462"},{"key":"e_1_3_3_3_5_2","unstructured":"Michael Ahn Anthony Brohan Noah Brown Yevgen Chebotar Omar Cortes Byron David Chelsea Finn Chuyuan Fu Keerthana Gopalakrishnan Karol Hausman Alex Herzog Daniel Ho Jasmine Hsu Julian Ibarz Brian Ichter Alex Irpan Eric Jang Rosario\u00a0Jauregui Ruano Kyle Jeffrey Sally Jesmonth Nikhil\u00a0J Joshi Ryan Julian Dmitry Kalashnikov Yuheng Kuang Kuang-Huei Lee Sergey Levine Yao Lu Linda Luu Carolina Parada Peter Pastor Jornell Quiambao Kanishka Rao Jarek Rettinghouse Diego Reyes Pierre Sermanet Nicolas Sievers Clayton Tan Alexander Toshev Vincent Vanhoucke Fei Xia Ted Xiao Peng Xu Sichun Xu Mengyuan Yan and Andy Zeng. 2022. Do As I Can Not As I Say: Grounding Language in Robotic Affordances. arxiv:https:\/\/arXiv.org\/abs\/2204.01691\u00a0[cs.RO] https:\/\/arxiv.org\/abs\/2204.01691"},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"crossref","unstructured":"Saleema Amershi Daniel\u00a0S. Weld Mihaela Vorvoreanu Adam Fourney Besmira Nushi Penny Collisson Jina Suh Shamsi\u00a0T. Iqbal Paul\u00a0N. Bennett Kori\u00a0Inkpen Quinn Jaime Teevan Ruth Kikin-Gil and Eric Horvitz. 2019. Guidelines for Human-AI Interaction. Proceedings of the 2019 CHI Conference on Human Factors in Computing Systems (2019). https:\/\/api.semanticscholar.org\/CorpusID:86866942","DOI":"10.1145\/3290605.3300233"},{"key":"e_1_3_3_3_7_2","unstructured":"Yuntao Bai Saurav Kadavath Sandipan Kundu Amanda Askell Jackson Kernion Andy Jones Anna Chen Anna Goldie Azalia Mirhoseini Cameron McKinnon et\u00a0al. 2022. Constitutional ai: Harmlessness from ai feedback. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2212.08073 (2022)."},{"key":"e_1_3_3_3_8_2","unstructured":"Tom\u00a0B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel\u00a0M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. arxiv:https:\/\/arXiv.org\/abs\/2005.14165\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2005.14165"},{"key":"e_1_3_3_3_9_2","unstructured":"Tom\u00a0B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel\u00a0M. Ziegler Jeff Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Ma teusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. ArXiv abs\/2005.14165 (2020). https:\/\/api.semanticscholar.org\/CorpusID:218971783"},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_18"},{"key":"e_1_3_3_3_11_2","unstructured":"Andrea Burns Deniz Arsan Sanjna Agrawal Ranjitha Kumar Kate Saenko and Bryan\u00a0A. Plummer. 2022. Interactive Mobile App Navigation with Uncertain or Under-specified Natural Language Commands. ArXiv abs\/2202.02312 (2022). https:\/\/api.semanticscholar.org\/CorpusID:246608249"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"crossref","unstructured":"Tuhin Chakrabarty Vishakh Padmakumar and Hengxing He. 2022. Help me write a Poem - Instruction Tuning as a Vehicle for Collaborative Poetry Writing. ArXiv abs\/2210.13669 (2022). https:\/\/api.semanticscholar.org\/CorpusID:253107865","DOI":"10.18653\/v1\/2022.emnlp-main.460"},{"key":"e_1_3_3_3_13_2","unstructured":"Huiwen Chang Han Zhang Jarred Barber AJ Maschinot Jos\u00e9 Lezama Lu Jiang Ming Yang Kevin\u00a0P. Murphy William\u00a0T. Freeman Michael Rubinstein Yuanzhen Li and Dilip Krishnan. 2023. Muse: Text-To-Image Generation via Masked Generative Transformers. ArXiv abs\/2301.00704 (2023). https:\/\/api.semanticscholar.org\/CorpusID:255372955"},{"key":"e_1_3_3_3_14_2","unstructured":"Hongwei Cui Yuyang Du Qun Yang Yulin Shao and Soung\u00a0Chang Liew. 2023. LLMind: Orchestrating AI and IoT with LLM for Complex Task Execution. https:\/\/api.semanticscholar.org\/CorpusID:266210033"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","unstructured":"Tianyu Cui Yanling Wang Chuanpu Fu Yong Xiao Sijia Li Xinhao Deng Yunpeng Liu Qinglin Zhang Ziyi Qiu Peiyang Li Zhixing Tan Junwu Xiong Xinyu Kong Zujie Wen Ke Xu and Qi Li. 2024. Risk Taxonomy Mitigation and Assessment Benchmarks of Large Language Model Systems. 10.48550\/arXiv.2401.05778arXiv:https:\/\/arXiv.org\/abs\/2401.05778 [cs].","DOI":"10.48550\/arXiv.2401.05778"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.669"},{"key":"e_1_3_3_3_17_2","unstructured":"Wenliang Dai Junnan Li Dongxu Li Anthony Meng\u00a0Huat Tiong Junqi Zhao Weisheng Wang Boyang\u00a0Albert Li Pascale Fung and Steven C.\u00a0H. Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. ArXiv abs\/2305.06500 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258615266"},{"key":"e_1_3_3_3_18_2","unstructured":"Nathalia\u00a0Moraes do Nascimento Paulo S.\u00a0C. Alencar and Donald\u00a0D. Cowan. 2023. GPT-in-the-Loop: Adaptive Decision-Making for Multiagent Systems. ArXiv abs\/2308.10435 (2023). https:\/\/api.semanticscholar.org\/CorpusID:261048710"},{"key":"e_1_3_3_3_19_2","unstructured":"Danny Driess Fei Xia Mehdi S.\u00a0M. Sajjadi Corey Lynch Aakanksha Chowdhery Brian Ichter Ayzaan Wahid Jonathan Tompson Quan Vuong Tianhe Yu Wenlong Huang Yevgen Chebotar Pierre Sermanet Daniel Duckworth Sergey Levine Vincent Vanhoucke Karol Hausman Marc Toussaint Klaus Greff Andy Zeng Igor Mordatch and Pete Florence. 2023. PaLM-E: An Embodied Multimodal Language Model. arxiv:https:\/\/arXiv.org\/abs\/2303.03378\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2303.03378"},{"key":"e_1_3_3_3_20_2","volume-title":"International Conference on Machine Learning","author":"Driess Danny","year":"2023","unstructured":"Danny Driess, F. Xia, Mehdi S.\u00a0M. Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan\u00a0Ho Vuong, Tianhe Yu, Wenlong Huang, Yevgen Chebotar, Pierre Sermanet, Daniel Duckworth, Sergey Levine, Vincent Vanhoucke, Karol Hausman, Marc Toussaint, Klaus Greff, Andy Zeng, Igor Mordatch, and Peter\u00a0R. Florence. 2023. PaLM-E: An Embodied Multimodal Language Model. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:257364842"},{"key":"e_1_3_3_3_21_2","unstructured":"Rohan\u00a0Anil et al.2023. PaLM 2 Technical Report. ArXiv abs\/2305.10403 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258740735"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"publisher","unstructured":"Marc Eulerich Nathan Waddoups Martin Wagener and David\u00a0A. Wood. 2024. The Dark Side of Robotic Process Automation (RPA): Understanding Risks and Challenges with RPA. Accounting Horizons 38 2 (June 2024) 143\u2013152. 10.2308\/HORIZONS-2022-019","DOI":"10.2308\/HORIZONS-2022-019"},{"key":"e_1_3_3_3_23_2","unstructured":"Rong-En Fan and Chih-Jen Lin. 2007. A study on threshold selection for multi-label classification. Department of Computer Science National Taiwan University (2007) 1\u201323."},{"key":"e_1_3_3_3_24_2","unstructured":"Paul\u00a0M Fitts. 1951. Human engineering for an effective air-navigation and traffic-control system. (1951)."},{"key":"e_1_3_3_3_25_2","unstructured":"Amelia Glaese Nat McAleese Maja Trkebacz John Aslanides Vlad Firoiu Timo Ewalds Maribeth Rauh Laura Weidinger Martin Chadwick Phoebe Thacker Lucy Campbell-Gillingham Jonathan Uesato Po-Sen Huang Ramona Comanescu Fan Yang A. See Sumanth Dathathri Rory Greig Charlie Chen Doug Fritz Jaume\u00a0Sanchez Elias Richard Green Sovna Mokr\u2019a Nicholas Fernando Boxi Wu Rachel Foley Susannah Young Iason Gabriel William\u00a0S. Isaac John F.\u00a0J. Mellor Demis Hassabis Koray Kavukcuoglu Lisa\u00a0Anne Hendricks and Geoffrey Irving. 2022. Improving alignment of dialogue agents via targeted human judgements. ArXiv abs\/2209.14375 (2022). https:\/\/api.semanticscholar.org\/CorpusID:252596089"},{"key":"e_1_3_3_3_26_2","unstructured":"Albert Gu and Tri Dao. 2023. Mamba: Linear-Time Sequence Modeling with Selective State Spaces. ArXiv abs\/2312.00752 (2023). https:\/\/api.semanticscholar.org\/CorpusID:265551773"},{"key":"e_1_3_3_3_27_2","unstructured":"Izzeddin Gur Hiroki Furuta Austin Huang Mustafa Safdari Yutaka Matsuo Douglas Eck and Aleksandra Faust. 2023. A Real-World WebAgent with Planning Long Context Understanding and Program Synthesis. ArXiv abs\/2307.12856 (2023). https:\/\/api.semanticscholar.org\/CorpusID:260126067"},{"key":"e_1_3_3_3_28_2","unstructured":"Izzeddin Gur Ulrich R\u00fcckert Aleksandra Faust and Dilek\u00a0Z. Hakkani-T\u00fcr. 2018. Learning to Navigate the Web. ArXiv abs\/1812.09195 (2018). https:\/\/api.semanticscholar.org\/CorpusID:56657805"},{"key":"e_1_3_3_3_29_2","volume-title":"LWA","author":"Hartmann Melanie","year":"2009","unstructured":"Melanie Hartmann. 2009. Challenges in Developing User-Adaptive Intelligent User Interfaces. In LWA. https:\/\/api.semanticscholar.org\/CorpusID:9977854"},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"crossref","unstructured":"Bright Hong Michael Ly and Hui Lin. 2023. Robotic process automation risk management: Points to consider. Journal of emerging technologies in accounting 20 1 (2023) 125\u2013145.","DOI":"10.2308\/JETA-2022-004"},{"key":"e_1_3_3_3_31_2","unstructured":"Sirui Hong Xiawu Zheng Jonathan\u00a0P. Chen Yuheng Cheng Ceyao Zhang Zili Wang Steven Ka\u00a0Shing Yau Zi\u00a0Hen Lin Liyang Zhou Chenyu Ran Lingfeng Xiao and Chenglin Wu. 2023. MetaGPT: Meta Programming for Multi-Agent Collaborative Framework. ArXiv abs\/2308.00352 (2023). https:\/\/api.semanticscholar.org\/CorpusID:260351380"},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/302979.303030"},{"key":"e_1_3_3_3_33_2","unstructured":"Wenyue Hua Xianjun Yang Zelong Li Cheng Wei and Yongfeng Zhang. 2024. TrustAgent: Towards Safe and Trustworthy LLM-based Agents through Agent Constitution. ArXiv abs\/2402.01586 (2024). https:\/\/api.semanticscholar.org\/CorpusID:267406347"},{"key":"e_1_3_3_3_34_2","unstructured":"Shaohan Huang Li Dong Wenhui Wang Yaru Hao Saksham Singhal Shuming Ma Tengchao Lv Lei Cui Owais\u00a0Khan Mohammed Qiang Liu Kriti Aggarwal Zewen Chi Johan Bjorck Vishrav Chaudhary Subhojit Som Xia Song and Furu Wei. 2023. Language Is Not All You Need: Aligning Perception with Language Models. ArXiv abs\/2302.14045 (2023). https:\/\/api.semanticscholar.org\/CorpusID:257219775"},{"key":"e_1_3_3_3_35_2","unstructured":"Wenlong Huang P. Abbeel Deepak Pathak and Igor Mordatch. 2022. Language Models as Zero-Shot Planners: Extracting Actionable Knowledge for Embodied Agents. ArXiv abs\/2201.07207 (2022). https:\/\/api.semanticscholar.org\/CorpusID:246035276"},{"key":"e_1_3_3_3_36_2","unstructured":"Sheikh\u00a0Asif Imran Mohammad Nur\u00a0Hossain Khan Subrata Biswas and Bashima Islam. 2024. LLaSA: Large Multimodal Agent for Human Activity Analysis Through Wearable Sensors. ArXiv abs\/2406.14498 (2024). https:\/\/api.semanticscholar.org\/CorpusID:270620504"},{"key":"e_1_3_3_3_37_2","unstructured":"Albert\u00a0Qiaochu Jiang Alexandre Sablayrolles Arthur Mensch Chris Bamford Devendra\u00a0Singh Chaplot Diego de Las\u00a0Casas Florian Bressand Gianna Lengyel Guillaume Lample Lucile Saulnier L\u2019elio\u00a0Renard Lavaud Marie-Anne Lachaux Pierre Stock Teven\u00a0Le Scao Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William\u00a0El Sayed. 2023. Mistral 7B. ArXiv abs\/2310.06825 (2023). https:\/\/api.semanticscholar.org\/CorpusID:263830494"},{"key":"e_1_3_3_3_38_2","unstructured":"Yue Jiang Eldon Schoop Amanda Swearngin and Jeffrey Nichols. 2023. ILuvUI: Instruction-tuned LangUage-Vision modeling of UIs from Machine Conversations. ArXiv abs\/2310.04869 (2023). https:\/\/api.semanticscholar.org\/CorpusID:263830178"},{"key":"e_1_3_3_3_39_2","unstructured":"D. Kondratyuk Lijun Yu Xiuye Gu Jos\u00e9 Lezama Jonathan Huang Rachel Hornung Hartwig Adam Hassan Akbari Yair Alon Vighnesh Birodkar Yong Cheng Ming-Chang Chiu Josh Dillon Irfan Essa Agrim Gupta Meera Hahn Anja Hauth David Hendon Alonso Martinez David\u00a0C. Minnen David\u00a0A. Ross Grant Schindler Mikhail Sirotenko Kihyuk Sohn Krishna Somandepalli Huisheng Wang Jimmy Yan Ming Yang Xuan Yang Bryan Seybold and Lu Jiang. 2023. VideoPoet: A Large Language Model for Zero-Shot Video Generation. ArXiv abs\/2312.14125 (2023). https:\/\/api.semanticscholar.org\/CorpusID:266435847"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"crossref","unstructured":"Mina Lee Percy Liang and Qian Yang. 2022. CoAuthor: Designing a Human-AI Collaborative Writing Dataset for Exploring Language Model Capabilities. Proceedings of the 2022 CHI Conference on Human Factors in Computing Systems (2022). https:\/\/api.semanticscholar.org\/CorpusID:246016439","DOI":"10.1145\/3491102.3502030"},{"key":"e_1_3_3_3_41_2","unstructured":"Bo Li Yuanhan Zhang Liangyu Chen Jinghao Wang Jingkang Yang and Ziwei Liu. 2023. Otter: A Multi-Modal Model with In-Context Instruction Tuning. ArXiv abs\/2305.03726 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258547300"},{"key":"e_1_3_3_3_42_2","unstructured":"Chunyuan Li Zhe Gan Zhengyuan Yang Jianwei Yang Linjie Li Lijuan Wang and Jianfeng Gao. 2023. Multimodal Foundation Models: From Specialists to General-Purpose Assistants. Found. Trends Comput. Graph. Vis. 16 (2023) 1\u2013214. https:\/\/api.semanticscholar.org\/CorpusID:262055614"},{"key":"e_1_3_3_3_43_2","unstructured":"Gang Li and Yang Li. 2022. Spotlight: Mobile UI Understanding using Vision-Language Models with a Focus. ArXiv abs\/2209.14927 (2022). https:\/\/api.semanticscholar.org\/CorpusID:252595735"},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","unstructured":"Tao Li Gang Li Zhiwei Deng Bryan Wang and Yang Li. 2023. A Zero-Shot Language Agent for Computer Control with Structured Reflection. (2023). 10.48550\/ARXIV.2310.08740Publisher: arXiv Version Number: 3.","DOI":"10.48550\/ARXIV.2310.08740"},{"key":"e_1_3_3_3_45_2","unstructured":"Wei Li Will Bishop Alice Li Christopher Rawles Folawiyo Campbell-Ajala Divya Tyamagundlu and Oriana Riva. 2024. On the Effects of Data Scale on Computer Control Agents. ArXiv abs\/2406.03679 (2024). https:\/\/api.semanticscholar.org\/CorpusID:270285816"},{"key":"e_1_3_3_3_46_2","unstructured":"Yang Li Jiacong He Xiaoxia Zhou Yuan Zhang and Jason Baldridge. 2020. Mapping Natural Language Instructions to Mobile UI Action Sequences. ArXiv abs\/2005.03776 (2020). https:\/\/api.semanticscholar.org\/CorpusID:218571167"},{"key":"e_1_3_3_3_47_2","unstructured":"Han Lin Abhaysinh Zala Jaemin Cho and Mohit Bansal. 2023. VideoDirectorGPT: Consistent Multi-scene Video Generation via LLM-Guided Planning. ArXiv abs\/2309.15091 (2023). https:\/\/api.semanticscholar.org\/CorpusID:262825203"},{"key":"e_1_3_3_3_48_2","unstructured":"Evan\u00a0Zheran Liu Kelvin Guu Panupong Pasupat Tianlin Shi and Percy Liang. 2018. Reinforcement Learning on Web Interfaces Using Workflow-Guided Exploration. ArXiv abs\/1802.08802 (2018). https:\/\/api.semanticscholar.org\/CorpusID:3530344"},{"key":"e_1_3_3_3_49_2","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2023. Visual Instruction Tuning. ArXiv abs\/2304.08485 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258179774"},{"key":"e_1_3_3_3_50_2","unstructured":"Xiao Liu Hao Yu Hanchen Zhang Yifan Xu Xuanyu Lei Hanyu Lai Yu Gu Yuxian Gu Hangliang Ding Kai Men Kejuan Yang Shudan Zhang Xiang Deng Aohan Zeng Zhengxiao Du Chenhui Zhang Shengqi Shen Sheng Shen Yu Su Huan Sun Minlie Huang Yuxiao Dong and Jie Tang. 2023. AgentBench: Evaluating LLMs as Agents. ArXiv abs\/2308.03688 (2023). https:\/\/api.semanticscholar.org\/CorpusID:260682249"},{"key":"e_1_3_3_3_51_2","unstructured":"Zhiwei Liu Weiran Yao Jianguo Zhang Le Xue Shelby Heinecke Rithesh Murthy Yihao Feng Zeyuan Chen Juan\u00a0Carlos Niebles Devansh Arpit et\u00a0al. 2023. Bolaa: Benchmarking and orchestrating llm-augmented autonomous agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.05960 (2023)."},{"key":"e_1_3_3_3_52_2","unstructured":"Zijun Liu Yanzhe Zhang Peng Li Yang Liu and Diyi Yang. 2023. Dynamic LLM-Agent Network: An LLM-agent Collaboration Framework with Agent Team Optimization. ArXiv abs\/2310.02170 (2023). https:\/\/api.semanticscholar.org\/CorpusID:263608687"},{"key":"e_1_3_3_3_53_2","unstructured":"Pan Lu Baolin Peng Hao Cheng Michel Galley Kai-Wei Chang Ying\u00a0Nian Wu Song-Chun Zhu and Jianfeng Gao. 2023. Chameleon: Plug-and-Play Compositional Reasoning with Large Language Models. ArXiv abs\/2304.09842 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258212542"},{"key":"e_1_3_3_3_54_2","doi-asserted-by":"crossref","unstructured":"Brandon McKinzie Zhe Gan Jean-Philippe Fauconnier Sam Dodge Bowen Zhang Philipp Dufter Dhruti Shah Xianzhi Du Futang Peng Floris Weers Anton Belyi Haotian Zhang Karanjeet Singh Doug Kang Ankur Jain Hongyu He Max Schwarzer Tom Gunter Xiang Kong Aonan Zhang Jianyu Wang Chong Wang Nan Du Tao Lei Sam Wiseman Guoli Yin Mark Lee Zirui Wang Ruoming Pang Peter Grasch Alexander Toshev and Yinfei Yang. 2024. MM1: Methods Analysis & Insights from Multimodal LLM Pre-training. ArXiv abs\/2403.09611 (2024). https:\/\/api.semanticscholar.org\/CorpusID:268384865","DOI":"10.1007\/978-3-031-73397-0_18"},{"key":"e_1_3_3_3_55_2","doi-asserted-by":"crossref","unstructured":"Sewon Min Xinxi Lyu Ari Holtzman Mikel Artetxe Mike Lewis Hannaneh Hajishirzi and Luke Zettlemoyer. 2022. Rethinking the Role of Demonstrations: What Makes In-Context Learning Work? ArXiv abs\/2202.12837 (2022). https:\/\/api.semanticscholar.org\/CorpusID:247155069","DOI":"10.18653\/v1\/2022.emnlp-main.759"},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/3533406.3533407"},{"key":"e_1_3_3_3_57_2","unstructured":"OpenAI. 2023. GPT-4 Technical Report. https:\/\/api.semanticscholar.org\/CorpusID:257532815"},{"key":"e_1_3_3_3_58_2","doi-asserted-by":"publisher","unstructured":"R. Parasuraman T.B. Sheridan and C.D. Wickens. 2000. A model for types and levels of human interaction with automation. IEEE Transactions on Systems Man and Cybernetics - Part A: Systems and Humans 30 3 (May 2000) 286\u2013297. 10.1109\/3468.844354Conference Name: IEEE Transactions on Systems Man and Cybernetics - Part A: Systems and Humans.","DOI":"10.1109\/3468.844354"},{"key":"e_1_3_3_3_59_2","unstructured":"Joon\u00a0Sung Park Joseph\u00a0C. O\u2019Brien Carrie\u00a0J. Cai Meredith\u00a0Ringel Morris Percy Liang and Michael\u00a0S. Bernstein. 2023. Generative Agents: Interactive Simulacra of Human Behavior. Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology (2023). https:\/\/api.semanticscholar.org\/CorpusID:258040990"},{"key":"e_1_3_3_3_60_2","unstructured":"Jingqing Ruan Yihong Chen Bin Zhang Zhiwei Xu Tianpeng Bao Guoqing Du Shiwei Shi Hangyu Mao Xingyu Zeng and Rui Zhao. 2023. TPTU: Large Language Model-based AI Agents for Task Planning and Tool Usage. https:\/\/api.semanticscholar.org\/CorpusID:260681466"},{"key":"e_1_3_3_3_61_2","unstructured":"Yangjun Ruan Honghua Dong Andrew Wang Silviu Pitis Yongchao Zhou Jimmy Ba Yann Dubois Chris\u00a0J. Maddison and Tatsunori Hashimoto. 2023. Identifying the Risks of LM Agents with an LM-Emulated Sandbox. ArXiv abs\/2309.15817 (2023). https:\/\/api.semanticscholar.org\/CorpusID:262944419"},{"key":"e_1_3_3_3_62_2","doi-asserted-by":"crossref","unstructured":"T Sheridan. 1978. Human and computer control of undersea teleoperators. Man-Machine Systems Laboratory Report (1978).","DOI":"10.21236\/ADA057655"},{"key":"e_1_3_3_3_63_2","volume-title":"International Conference on Machine Learning","author":"Shi Tianlin","year":"2017","unstructured":"Tianlin Shi, Andrej Karpathy, Linxi\u00a0(Jim) Fan, Josefa\u00a0Z. Hern\u00e1ndez, and Percy Liang. 2017. World of Bits: An Open-Domain Platform for Web-Based Agents. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:34953552"},{"key":"e_1_3_3_3_64_2","doi-asserted-by":"publisher","unstructured":"Ben Shneiderman. 2020. Human-Centered Artificial Intelligence: Reliable Safe & Trustworthy. International Journal of Human\u2013Computer Interaction 36 6 (April 2020) 495\u2013504. 10.1080\/10447318.2020.1741118Publisher: Taylor & Francis _eprint: https:\/\/doi.org\/10.1080\/10447318.2020.1741118.","DOI":"10.1080\/10447318.2020.1741118"},{"key":"e_1_3_3_3_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676386"},{"key":"e_1_3_3_3_66_2","unstructured":"Lichao Sun Yue Huang Haoran Wang Siyuan Wu Qihui Zhang Chujie Gao Yixin Huang Wenhan Lyu Yixuan Zhang Xiner Li Zheng Liu Yixin Liu Yijue Wang Zhikun Zhang Bhavya Kailkhura Caiming Xiong Chaowei Xiao Chun-Yan Li Eric\u00a0P. Xing Furong Huang Haodong Liu Heng Ji Hongyi Wang Huan Zhang Huaxiu Yao Manolis Kellis Marinka Zitnik Meng Jiang Mohit Bansal James Zou Jian Pei Jian Liu Jianfeng Gao Jiawei Han Jieyu Zhao Jiliang Tang Jindong Wang John Mitchell Kai Shu Kaidi Xu Kai-Wei Chang Lifang He Lifu Huang Michael Backes Neil\u00a0Zhenqiang Gong Philip\u00a0S. Yu Pin-Yu Chen Quanquan Gu Ran Xu Rex Ying Shuiwang Ji Suman\u00a0Sekhar Jana Tian-Xiang Chen Tianming Liu Tianying Zhou William Wang Xiang Li Xiang-Yu Zhang Xiao Wang Xingyao Xie Xun Chen Xuyu Wang Yan Liu Yanfang Ye Yinzhi Cao and Yue Zhao. 2024. TrustLLM: Trustworthiness in Large Language Models. ArXiv abs\/2401.05561 (2024). https:\/\/api.semanticscholar.org\/CorpusID:266933236"},{"key":"e_1_3_3_3_67_2","unstructured":"Quan Sun Qiying Yu Yufeng Cui Fan Zhang Xiaosong Zhang Yueze Wang Hongcheng Gao Jingjing Liu Tiejun Huang and Xinlong Wang. 2023. Generative Pretraining in Multimodality. ArXiv abs\/2307.05222 (2023). https:\/\/api.semanticscholar.org\/CorpusID:259765944"},{"key":"e_1_3_3_3_68_2","doi-asserted-by":"crossref","unstructured":"Amanda Swearngin Jason Wu Xiaoyi Zhang Esteban Gomez Jen Coughenour Rachel Stukenborg Bhavya Garg Greg Hughes Adriana Hilliard Jeffrey\u00a0P Bigham and Jeffrey Nichols. 2024. Towards Automated Accessibility Report Generation for Mobile Apps. ACM Transactions on Computer-Human Interaction (2024).","DOI":"10.1145\/3674967"},{"key":"e_1_3_3_3_69_2","doi-asserted-by":"crossref","unstructured":"Maryam Taeb Amanda Swearngin Eldon Schoop Ruijia Cheng Yue Jiang and Jeffrey Nichols. 2023. AXNav: Replaying Accessibility Tests from Natural Language. Proceedings of the CHI Conference on Human Factors in Computing Systems (2023). https:\/\/api.semanticscholar.org\/CorpusID:264148114","DOI":"10.1145\/3613904.3642777"},{"key":"e_1_3_3_3_70_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. ArXiv abs\/2302.13971 (2023). https:\/\/api.semanticscholar.org\/CorpusID:257219404"},{"key":"e_1_3_3_3_71_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676356"},{"key":"e_1_3_3_3_72_2","doi-asserted-by":"crossref","unstructured":"Bryan Wang Gang Li and Yang Li. 2022. Enabling Conversational Interaction with Mobile UI using Large Language Models. Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems (2022). https:\/\/api.semanticscholar.org\/CorpusID:252367445","DOI":"10.1145\/3544548.3580895"},{"key":"e_1_3_3_3_73_2","doi-asserted-by":"publisher","DOI":"10.1145\/3472749.3474765"},{"key":"e_1_3_3_3_74_2","unstructured":"Guanzhi Wang Yuqi Xie Yunfan Jiang Ajay Mandlekar Chaowei Xiao Yuke Zhu Linxi\u00a0(Jim) Fan and Anima Anandkumar. 2023. Voyager: An Open-Ended Embodied Agent with Large Language Models. Trans. Mach. Learn. Res. 2024 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258887849"},{"key":"e_1_3_3_3_75_2","unstructured":"Xingyao Wang Yangyi Chen Lifan Yuan Yizhe Zhang Yunzhu Li Hao Peng and Heng Ji. 2024. Executable Code Actions Elicit Better LLM Agents. ArXiv abs\/2402.01030 (2024). https:\/\/api.semanticscholar.org\/CorpusID:267406155"},{"key":"e_1_3_3_3_76_2","doi-asserted-by":"crossref","unstructured":"Zhilin Wang Yu\u00a0Ying Chiu and Yu\u00a0Cheung Chiu. 2023. Humanoid Agents: Platform for Simulating Human-like Generative Agents. ArXiv abs\/2310.05418 (2023). https:\/\/api.semanticscholar.org\/CorpusID:263830637","DOI":"10.18653\/v1\/2023.emnlp-demo.15"},{"key":"e_1_3_3_3_77_2","unstructured":"Jason Wei Xuezhi Wang Dale Schuurmans Maarten Bosma Brian Ichter Fei Xia Ed Chi Quoc Le and Denny Zhou. 2023. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2201.11903\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2201.11903"},{"key":"e_1_3_3_3_78_2","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533088"},{"key":"e_1_3_3_3_79_2","doi-asserted-by":"crossref","unstructured":"Hao Wen Yuanchun Li Guohong Liu Shanhui Zhao Tao Yu Toby Jia-Jun Li Shiqi Jiang Yunhao Liu Yaqin Zhang and Yunxin Liu. 2023. AutoDroid: LLM-powered Task Automation in Android. Proceedings of the 30th Annual International Conference on Mobile Computing and Networking (2023). https:\/\/api.semanticscholar.org\/CorpusID:261277501","DOI":"10.1145\/3636534.3649379"},{"key":"e_1_3_3_3_80_2","unstructured":"Hao Wen Yuanchun Li Guohong Liu Shanhui Zhao Tao Yu Toby Jia-Jun Li Shiqi Jiang Yunhao Liu Yaqin Zhang and Yunxin Liu. 2023. Empowering LLM to use Smartphone for Intelligent Task Automation. ArXiv abs\/2308.15272 (2023). https:\/\/api.semanticscholar.org\/CorpusID:268890279"},{"key":"e_1_3_3_3_81_2","unstructured":"Jason Wu Eldon Schoop Alan Leung Titus Barik Jeffrey\u00a0P. Bigham and Jeffrey Nichols. 2024. UICoder: Finetuning Large Language Models to Generate User Interface Code through Automated Feedback. ArXiv abs\/2406.07739 (2024). https:\/\/api.semanticscholar.org\/CorpusID:270391741"},{"key":"e_1_3_3_3_82_2","unstructured":"Qingyun Wu Gagan Bansal Jieyu Zhang Yiran Wu Shaokun Zhang Erkang Zhu Beibin Li Li Jiang Xiaoyun Zhang and Chi Wang. 2023. AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework. ArXiv abs\/2308.08155 (2023). https:\/\/api.semanticscholar.org\/CorpusID:260925901"},{"key":"e_1_3_3_3_83_2","unstructured":"Hengjia Xiao and Peng Wang. 2023. LLM A*: Human in the Loop Large Language Models Enabled A* Search for Robotics. ArXiv abs\/2312.01797 (2023). https:\/\/api.semanticscholar.org\/CorpusID:265609154"},{"key":"e_1_3_3_3_84_2","doi-asserted-by":"crossref","unstructured":"Jianing Yang Xuweiyi Chen Shengyi Qian Nikhil Madaan Madhavan Iyengar David\u00a0F. Fouhey and Joyce Chai. 2023. LLM-Grounder: Open-Vocabulary 3D Visual Grounding with Large Language Model as an Agent. 2024 IEEE International Conference on Robotics and Automation (ICRA) (2023) 7694\u20137701. https:\/\/api.semanticscholar.org\/CorpusID:262084072","DOI":"10.1109\/ICRA57147.2024.10610443"},{"key":"e_1_3_3_3_85_2","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yi Zhou Junyan Wang Anwen Hu Pengcheng Shi Yaya Shi Chenliang Li Yuanhong Xu Hehong Chen Junfeng Tian Qiang Qi Ji Zhang and Feiyan Huang. 2023. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. ArXiv abs\/2304.14178 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258352455"},{"key":"e_1_3_3_3_86_2","unstructured":"Haoxuan You Haotian Zhang Zhe Gan Xianzhi Du Bowen Zhang Zirui Wang Liangliang Cao Shih-Fu Chang and Yinfei Yang. 2023. Ferret: Refer and Ground Anything Anywhere at Any Granularity. ArXiv abs\/2310.07704 (2023). https:\/\/api.semanticscholar.org\/CorpusID:263834718"},{"key":"e_1_3_3_3_87_2","doi-asserted-by":"publisher","unstructured":"Chaoyun Zhang Liqun Li Shilin He Xu Zhang Bo Qiao Si Qin Minghua Ma Yu Kang Qingwei Lin Saravan Rajmohan Dongmei Zhang and Qi Zhang. 2024. UFO: A UI-Focused Agent for Windows OS Interaction. 10.48550\/arXiv.2402.07939arXiv:https:\/\/arXiv.org\/abs\/2402.07939 [cs].","DOI":"10.48550\/arXiv.2402.07939"},{"key":"e_1_3_3_3_88_2","unstructured":"Haotian Zhang Mingfei Gao Zhe Gan Philipp Dufter Nina Wenzel Forrest Huang Dhruti Shah Xianzhi Du Bowen Zhang Yanghao Li et\u00a0al. 2024. MM1. 5: Methods Analysis & Insights from Multimodal LLM Fine-tuning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.20566 (2024)."},{"key":"e_1_3_3_3_89_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676382"},{"key":"e_1_3_3_3_90_2","doi-asserted-by":"crossref","unstructured":"Xiaoyi Zhang Lilian de Greef Amanda Swearngin Samuel White Kyle\u00a0I. Murray Lisa Yu Qi Shan Jeffrey Nichols Jason Wu Chris Fleizach Aaron Everitt and Jeffrey\u00a0P. Bigham. 2021. Screen Recognition: Creating Accessibility Metadata for Mobile Applications from Pixels. Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems (2021). https:\/\/api.semanticscholar.org\/CorpusID:231592643","DOI":"10.1145\/3411764.3445186"},{"key":"e_1_3_3_3_91_2","unstructured":"Zhizheng Zhang Xiaoyi Zhang Wenxuan Xie and Yan Lu. 2023. Responsible Task Automation: Empowering Large Language Models as Responsible Task Automators. ArXiv abs\/2306.01242 (2023). https:\/\/api.semanticscholar.org\/CorpusID:259063857"},{"key":"e_1_3_3_3_92_2","unstructured":"Shuyan Zhou Frank\u00a0F. Xu Hao Zhu Xuhui Zhou Robert Lo Abishek Sridhar Xianyi Cheng Yonatan Bisk Daniel Fried Uri Alon and Graham Neubig. 2023. WebArena: A Realistic Web Environment for Building Autonomous Agents. ArXiv abs\/2307.13854 (2023). https:\/\/api.semanticscholar.org\/CorpusID:260164780"},{"key":"e_1_3_3_3_93_2","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. ArXiv abs\/2304.10592 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258291930"}],"event":{"name":"IUI '25: 30th International Conference on Intelligent User Interfaces","location":"Cagliari Italy","acronym":"IUI '25","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 30th International Conference on Intelligent User Interfaces"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708359.3712153","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3708359.3712153","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:57:06Z","timestamp":1750298226000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708359.3712153"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,24]]},"references-count":92,"alternative-id":["10.1145\/3708359.3712153","10.1145\/3708359"],"URL":"https:\/\/doi.org\/10.1145\/3708359.3712153","relation":{},"subject":[],"published":{"date-parts":[[2025,3,24]]},"assertion":[{"value":"2025-03-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}