{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T01:34:34Z","timestamp":1780623274145,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T00:00:00Z","timestamp":1728604800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,13]]},"DOI":"10.1145\/3654777.3676382","type":"proceedings-article","created":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T10:50:36Z","timestamp":1728643836000},"page":"1-13","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["LlamaTouch: A Faithful and Scalable Testbed for Mobile UI Task Automation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0779-8310","authenticated-orcid":false,"given":"Li","family":"Zhang","sequence":"first","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6883-2036","authenticated-orcid":false,"given":"Shihe","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7309-2936","authenticated-orcid":false,"given":"Xianqing","family":"Jia","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8896-4222","authenticated-orcid":false,"given":"Zhihan","family":"Zheng","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1798-9826","authenticated-orcid":false,"given":"Yunhe","family":"Yan","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9762-1160","authenticated-orcid":false,"given":"Longxi","family":"Gao","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1591-2526","authenticated-orcid":false,"given":"Yuanchun","family":"Li","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6271-6993","authenticated-orcid":false,"given":"Mengwei","family":"Xu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2017. XML Path Language (XPath) 3.1. https:\/\/www.w3.org\/TR\/xpath-31\/."},{"key":"e_1_3_2_1_2_1","unstructured":"2018. Single activity: Why when and how (Android Dev Summit \u201918). https:\/\/www.youtube.com\/watch?v=2k8x8V77CrU."},{"key":"e_1_3_2_1_3_1","unstructured":"2024. Activity | Android Developers. https:\/\/developer.android.com\/reference\/android\/app\/Activity."},{"key":"e_1_3_2_1_4_1","unstructured":"2024. Android UIAutomator2. https:\/\/github.com\/appium\/appium-uiautomator2-driver."},{"key":"e_1_3_2_1_5_1","volume-title":"Gpt-4 technical report. arXiv preprint arXiv:2303.08774","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia\u00a0Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_6_1","unstructured":"Apple. 2024. Siri - Apple. https:\/\/www.apple.com\/siri\/."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445762"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_18"},{"key":"e_1_3_2_1_9_1","volume-title":"Can Large Language Models Be an Alternative to Human Evaluations?arXiv preprint arXiv:2305.01937","author":"Chiang Cheng-Han","year":"2023","unstructured":"Cheng-Han Chiang and Hung-yi Lee. 2023. Can Large Language Models Be an Alternative to Human Evaluations?arXiv preprint arXiv:2305.01937 (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126594.3126651"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3490099.3511109"},{"key":"e_1_3_2_1_12_1","unstructured":"Google. 2023. Run apps on the Android Emulator | Android Developers. https:\/\/developer.android.com\/studio\/run\/emulator."},{"key":"e_1_3_2_1_13_1","unstructured":"Google. 2024. Build web apps in WebView. https:\/\/developer.android.com\/develop\/ui\/views\/layout\/webapps\/webview."},{"key":"e_1_3_2_1_14_1","unstructured":"Google. 2024. Google Assistant your own personal Google. https:\/\/www.apple.com\/siri\/."},{"key":"e_1_3_2_1_15_1","volume-title":"CogAgent: A Visual Language Model for GUI Agents. arXiv preprint arXiv:2312.08914","author":"Hong Wenyi","year":"2023","unstructured":"Wenyi Hong, Weihan Wang, Qingsong Lv, Jiazheng Xu, Wenmeng Yu, Junhui Ji, Yan Wang, Zihan Wang, Yuxiao Dong, Ming Ding, 2023. CogAgent: A Visual Language Model for GUI Agents. arXiv preprint arXiv:2312.08914 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Augmenting LLM with Human-like Memory for Mobile Task Automation. arXiv preprint arXiv:2312.03003","author":"Lee Sunjae","year":"2023","unstructured":"Sunjae Lee, Junyoung Choi, Jungjae Lee, Hojun Choi, Steven\u00a0Y Ko, Sangeun Oh, and Insik Shin. 2023. Explore, Select, Derive, and Recall: Augmenting LLM with Human-like Memory for Mobile Task Automation. arXiv preprint arXiv:2312.03003 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"Mapping natural language instructions to mobile UI action sequences. arXiv preprint arXiv:2005.03776","author":"Li Yang","year":"2020","unstructured":"Yang Li, Jiacong He, Xin Zhou, Yuan Zhang, and Jason Baldridge. 2020. Mapping natural language instructions to mobile UI action sequences. arXiv preprint arXiv:2005.03776 (2020)."},{"key":"e_1_3_2_1_19_1","volume-title":"Personal LLM Agents: Insights and Survey about the Capability, Efficiency and Security. arXiv preprint arXiv:2401.05459","author":"Li Yuanchun","year":"2024","unstructured":"Yuanchun Li, Hao Wen, Weijun Wang, Xiangyu Li, Yizhen Yuan, Guohong Liu, Jiacheng Liu, Wenxing Xu, Xiang Wang, Yi Sun, 2024. Personal LLM Agents: Insights and Survey about the Capability, Efficiency and Security. arXiv preprint arXiv:2401.05459 (2024)."},{"key":"e_1_3_2_1_20_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2023. Visual Instruction Tuning. arxiv:2304.08485\u00a0[cs.CV]"},{"key":"e_1_3_2_1_21_1","volume-title":"Comprehensive Cognitive LLM Agent for Smartphone GUI Automation. arXiv preprint arXiv:2402.11941","author":"Ma Xinbei","year":"2024","unstructured":"Xinbei Ma, Zhuosheng Zhang, and Hai Zhao. 2024. Comprehensive Cognitive LLM Agent for Smartphone GUI Automation. arXiv preprint arXiv:2402.11941 (2024)."},{"key":"e_1_3_2_1_22_1","unstructured":"OpenAI. 2023. GPT-4V(ision) system card. https:\/\/openai.com\/research\/gpt-4v-system-card."},{"key":"e_1_3_2_1_23_1","volume-title":"AutoTask: Executing Arbitrary Voice Commands by Exploring and Learning from Mobile GUI. arXiv preprint arXiv:2312.16062","author":"Pan Lihang","year":"2023","unstructured":"Lihang Pan, Bowen Wang, Chun Yu, Yuxuan Chen, Xiangyu Zhang, and Yuanchun Shi. 2023. AutoTask: Executing Arbitrary Voice Commands by Exploring and Learning from Mobile GUI. arXiv preprint arXiv:2312.16062 (2023)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_1_25_1","volume-title":"Android in the wild: A large-scale dataset for android device control. arXiv preprint arXiv:2307.10088","author":"Rawles Christopher","year":"2023","unstructured":"Christopher Rawles, Alice Li, Daniel Rodriguez, Oriana Riva, and Timothy Lillicrap. 2023. Android in the wild: A large-scale dataset for android device control. arXiv preprint arXiv:2307.10088 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"META-GUI: Towards Multi-modal Conversational Agents on Mobile GUI. arXiv preprint arXiv:2205.11029","author":"Sun Liangtai","year":"2022","unstructured":"Liangtai Sun, Xingyu Chen, Lu Chen, Tianle Dai, Zichen Zhu, and Kai Yu. 2022. META-GUI: Towards Multi-modal Conversational Agents on Mobile GUI. arXiv preprint arXiv:2205.11029 (2022)."},{"key":"e_1_3_2_1_27_1","volume-title":"AXNav: Replaying Accessibility Tests from Natural Language. arXiv preprint arXiv:2310.02424","author":"Taeb Maryam","year":"2023","unstructured":"Maryam Taeb, Amanda Swearngin, Eldon School, Ruijia Cheng, Yue Jiang, and Jeffrey Nichols. 2023. AXNav: Replaying Accessibility Tests from Natural Language. arXiv preprint arXiv:2310.02424 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Androidenv: A reinforcement learning platform for android. arXiv preprint arXiv:2105.13231","author":"Toyama Daniel","year":"2021","unstructured":"Daniel Toyama, Philippe Hamel, Anita Gergely, Gheorghe Comanici, Amelia Glaese, Zafarali Ahmed, Tyler Jackson, Shibl Mourad, and Doina Precup. 2021. Androidenv: A reinforcement learning platform for android. arXiv preprint arXiv:2105.13231 (2021)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580895"},{"key":"e_1_3_2_1_30_1","volume-title":"Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu.","author":"Wen Hao","year":"2024","unstructured":"Hao Wen, Yuanchun Li, Guohong Liu, Shanhui Zhao, Tao Yu, Toby Jia-Jun Li, Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu. 2024. Autodroid: Llm-powered task automation in android. (2024), 543\u2013557."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606824"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581158"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3368089.3417940"},{"key":"e_1_3_2_1_34_1","volume-title":"Understanding the Weakness of Large Language Model Agents within a Complex Android Environment. arXiv preprint arXiv:2402.06596","author":"Xing Mingzhe","year":"2024","unstructured":"Mingzhe Xing, Rongkai Zhang, Hui Xue, Qi Chen, Fan Yang, and Zhen Xiao. 2024. Understanding the Weakness of Large Language Model Agents within a Complex Android Environment. arXiv preprint arXiv:2402.06596 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"GPT-4V in Wonderland: Large Multimodal Models for Zero-Shot Smartphone GUI Navigation. arXiv preprint arXiv:2311.07562","author":"Yan An","year":"2023","unstructured":"An Yan, Zhengyuan Yang, Wanrong Zhu, Kevin Lin, Linjie Li, Jianfeng Wang, Jianwei Yang, Yiwu Zhong, Julian McAuley, Jianfeng Gao, 2023. GPT-4V in Wonderland: Large Multimodal Models for Zero-Shot Smartphone GUI Navigation. arXiv preprint arXiv:2311.07562 (2023)."},{"key":"e_1_3_2_1_36_1","volume-title":"AppAgent: Multimodal Agents as Smartphone Users. arXiv preprint arXiv:2312.13771","author":"Yang Zhao","year":"2023","unstructured":"Zhao Yang, Jiaxuan Liu, Yucheng Han, Xin Chen, Zebiao Huang, Bin Fu, and Gang Yu. 2023. AppAgent: Multimodal Agents as Smartphone Users. arXiv preprint arXiv:2312.13771 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"You only look at screens: Multimodal chain-of-action agents. arXiv preprint arXiv:2309.11436","author":"Zhan Zhuosheng","year":"2023","unstructured":"Zhuosheng Zhan and Aston Zhang. 2023. You only look at screens: Multimodal chain-of-action agents. arXiv preprint arXiv:2309.11436 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Mobile-env: A universal platform for training and evaluation of mobile interaction. arXiv preprint arXiv:2305.08144","author":"Zhang Danyang","year":"2023","unstructured":"Danyang Zhang, Lu Chen, and Kai Yu. 2023. Mobile-env: A universal platform for training and evaluation of mobile interaction. arXiv preprint arXiv:2305.08144 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Webarena: A realistic web environment for building autonomous agents. arXiv preprint arXiv:2307.13854","author":"Zhou Shuyan","year":"2023","unstructured":"Shuyan Zhou, Frank\u00a0F Xu, Hao Zhu, Xuhui Zhou, Robert Lo, Abishek Sridhar, Xianyi Cheng, Yonatan Bisk, Daniel Fried, Uri Alon, 2023. Webarena: A realistic web environment for building autonomous agents. arXiv preprint arXiv:2307.13854 (2023)."}],"event":{"name":"UIST '24: The 37th Annual ACM Symposium on User Interface Software and Technology","location":"Pittsburgh PA USA","acronym":"UIST '24"},"container-title":["Proceedings of the 37th Annual ACM Symposium on User Interface Software and Technology"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3654777.3676382","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3654777.3676382","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,4]],"date-time":"2025-08-04T21:15:39Z","timestamp":1754342139000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3654777.3676382"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,11]]},"references-count":39,"alternative-id":["10.1145\/3654777.3676382","10.1145\/3654777"],"URL":"https:\/\/doi.org\/10.1145\/3654777.3676382","relation":{},"subject":[],"published":{"date-parts":[[2024,10,11]]},"assertion":[{"value":"2024-10-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}