{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,10]],"date-time":"2026-06-10T02:53:29Z","timestamp":1781060009092,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,29]],"date-time":"2024-05-29T00:00:00Z","timestamp":1716940800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"National Key R&D Program of China","award":["No.2022YFF0604501"],"award-info":[{"award-number":["No.2022YFF0604501"]}]},{"name":"NSFC","award":["No.62272261"],"award-info":[{"award-number":["No.62272261"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,29]]},"DOI":"10.1145\/3636534.3649379","type":"proceedings-article","created":{"date-parts":[[2024,5,29]],"date-time":"2024-05-29T13:32:55Z","timestamp":1716989575000},"page":"543-557","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":89,"title":["AutoDroid: LLM-powered Task Automation in Android"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-8450-7795","authenticated-orcid":false,"given":"Hao","family":"Wen","sequence":"first","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, Beijing, -Select-, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1591-2526","authenticated-orcid":false,"given":"Yuanchun","family":"Li","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5959-8604","authenticated-orcid":false,"given":"Guohong","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4205-0770","authenticated-orcid":false,"given":"Shanhui","family":"Zhao","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8815-7407","authenticated-orcid":false,"given":"Tao","family":"Yu","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7902-7625","authenticated-orcid":false,"given":"Toby Jia-Jun","family":"Li","sequence":"additional","affiliation":[{"name":"University of Notre Dame, Notre Dame, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4685-9633","authenticated-orcid":false,"given":"Shiqi","family":"Jiang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8052-9200","authenticated-orcid":false,"given":"Yunhao","family":"Liu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4515-6212","authenticated-orcid":false,"given":"Yaqin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7352-8955","authenticated-orcid":false,"given":"Yunxin","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,5,29]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Anthropic. 2023. Claude. https:\/\/www.anthropic.com\/product."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2906388.2906416"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_18"},{"key":"e_1_3_2_1_4_1","unstructured":"Mark Chen Jerry Tworek Heewoo Jun et al. 2021. Evaluating Large Language Models Trained on Code. (2021). arXiv:cs.LG\/2107.03374"},{"key":"e_1_3_2_1_5_1","volume-title":"Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, et al. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"e_1_3_2_1_6_1","unstructured":"Aakanksha Chowdhery Sharan Narang Jacob Devlin Maarten Bosma et al. 2022. PaLM: Scaling Language Modeling with Pathways. arXiv:cs.CL\/2204.02311"},{"key":"e_1_3_2_1_7_1","unstructured":"Karl Cobbe Vineet Kosaraju Mohammad Bavarian et al. 2021. Training Verifiers to Solve Math Word Problems. arXiv preprint arXiv:2110.14168 (2021)."},{"key":"e_1_3_2_1_8_1","unstructured":"Xiang Deng Yu Gu Boyuan Zheng et al. 2023. Mind2Web: Towards a Generalist Agent for the Web. arXiv:cs.CL\/2306.06070"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Cheng-Yu Hsieh Chun-Liang Li Chih-Kuan Yeh et al. 2023. Distilling step-by-step! outperforming larger language models with less training data and smaller model sizes. arXiv preprint arXiv:2305.02301 (2023).","DOI":"10.18653\/v1\/2023.findings-acl.507"},{"key":"e_1_3_2_1_10_1","volume-title":"International Conference on Machine Learning. PMLR, 9466--9482","author":"Humphreys Peter C","year":"2022","unstructured":"Peter C Humphreys, David Raposo, Tobias Pohlen, et al. 2022. A data-driven approach for learning to control computers. In International Conference on Machine Learning. PMLR, 9466--9482."},{"key":"e_1_3_2_1_11_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS","volume":"35","author":"Kojima Takeshi","year":"2022","unstructured":"Takeshi Kojima, Shixiang (Shane) Gu, Machel Reid, et al. 2022. Large Language Models are Zero-Shot Reasoners. In Advances in Neural Information Processing Systems (NeurIPS 2022), Vol. 35. 22199--22213."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3495243.3560522"},{"key":"e_1_3_2_1_13_1","volume-title":"The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=9yE2xEj0BH7","author":"Li Gang","year":"2023","unstructured":"Gang Li and Yang Li. 2023. Spotlight: Mobile UI Understanding using Vision-Language Models with a Focus. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=9yE2xEj0BH7"},{"key":"e_1_3_2_1_14_1","volume-title":"EndUser Development: 6th International Symposium, IS-EUD 2017, Eindhoven, The Netherlands, June 13--15, 2017, Proceedings 6. Springer, 3--17","author":"Jia-Jun Li Toby","year":"2017","unstructured":"Toby Jia-Jun Li, Yuanchun Li, Fanglin Chen, and Brad A Myers. 2017. Programming IoT devices by demonstration using mobile apps. In EndUser Development: 6th International Symposium, IS-EUD 2017, Eindhoven, The Netherlands, June 13--15, 2017, Proceedings 6. Springer, 3--17."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3210240.3210339"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.729"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.443"},{"key":"e_1_3_2_1_18_1","volume-title":"VUT: Versatile UI Transformer for Multimodal Multi-Task User Interface Modeling. In International Conference on Learning Representations (ICLR). https:\/\/openreview.net\/forum?id=rF5UoZFrsF4","author":"Li Yang","unstructured":"Yang Li, Gang Li, Xin Zhou, Mostafa Dehghani, and Alexey A. Gritsenko. 2022. VUT: Versatile UI Transformer for Multimodal Multi-Task User Interface Modeling. In International Conference on Learning Representations (ICLR). https:\/\/openreview.net\/forum?id=rF5UoZFrsF4"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462905"},{"key":"e_1_3_2_1_20_1","unstructured":"Yuanchun Li Hao Wen Weijun Wang et al. 2024. Personal LLM Agents: Insights and Survey about the Capability Efficiency and Security. arXiv preprint arXiv:2401.05459 (2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-C.2017.8"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASE.2019.00104"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2639108.2639131"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3570361.3613259"},{"key":"e_1_3_2_1_25_1","volume-title":"Song-Chun Zhu, and Jianfeng Gao.","author":"Lu Pan","year":"2023","unstructured":"Pan Lu, Baolin Peng, Hao Cheng, Michel Galley, Kai-Wei Chang, Ying Nian Wu, Song-Chun Zhu, and Jianfeng Gao. 2023. Chameleon: Plug-and-Play Compositional Reasoning with Large Language Models. arXiv:cs.CL\/2304.09842"},{"key":"e_1_3_2_1_26_1","unstructured":"Microsoft. 2023. PII Codex. [Online]. Available at: https:\/\/github.com\/EdyVision\/pii-codex."},{"key":"e_1_3_2_1_27_1","unstructured":"Reiichiro Nakano Jacob Hilton Suchir Balaji et al. 2022. WebGPT: Browser-assisted question-answering with human feedback. (2022). arXiv:cs.CL\/2112.09332"},{"key":"e_1_3_2_1_28_1","unstructured":"OpenAI. 2022. ChatGPT. [Online]. Available at: https:\/\/openai.com\/blog\/chatgpt\/."},{"key":"e_1_3_2_1_30_1","volume-title":"Gonzalez","author":"Patil Shishir G.","year":"2023","unstructured":"Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. 2023. Gorilla: Large Language Model Connected with Massive APIs. arXiv:cs.CL\/2305.15334"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.26599\/TST.2023.9010119"},{"key":"e_1_3_2_1_32_1","unstructured":"Yongliang Shen Kaitao Song Xu Tan Dongsheng Li Weiming Lu and Yueting Zhuang. 2023. HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face. arXiv:cs.CL\/2303.17580"},{"key":"e_1_3_2_1_33_1","volume-title":"Any Task: Instruction-Finetuned Text Embeddings. In Findings of the Association for Computational Linguistics: ACL 2023","author":"Su Hongjin","year":"2023","unstructured":"Hongjin Su, Weijia Shi, Jungo Kasai, et al. 2023. One Embedder, Any Task: Instruction-Finetuned Text Embeddings. In Findings of the Association for Computational Linguistics: ACL 2023. Association for Computational Linguistics, Toronto, Canada, 1102--1121. https:\/\/aclanthology.org\/2023.findings-acl.71"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3121049"},{"key":"e_1_3_2_1_35_1","volume-title":"META-GUI: Towards Multi-modal Conversational Agents on Mobile GUI. arXiv preprint arXiv:2205.11029","author":"Sun Liangtai","year":"2022","unstructured":"Liangtai Sun, Xingyu Chen, Lu Chen, Tianle Dai, Zichen Zhu, and Kai Yu. 2022. META-GUI: Towards Multi-modal Conversational Agents on Mobile GUI. arXiv preprint arXiv:2205.11029 (2022)."},{"key":"e_1_3_2_1_36_1","volume-title":"Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca. GitHub repository","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, et al. 2023. Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca. GitHub repository (2023)."},{"key":"e_1_3_2_1_37_1","unstructured":"MLC team. 2023. MLC-LLM. https:\/\/github.com\/mlc-ai\/mlc-llm"},{"key":"e_1_3_2_1_38_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard et al. 2023. LLaMA: Open and Efficient Foundation Language Models. (2023). arXiv:cs.CL\/2302.13971"},{"key":"e_1_3_2_1_39_1","unstructured":"Daniel Toyama Philippe Hamel Anita Gergely et al. 2021. AndroidEnv: A Reinforcement Learning Platform for Android. abs\/2105.13231 (2021). arXiv:cs.LG\/2105.13231 http:\/\/arxiv.org\/abs\/2105.13231"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_41_1","volume-title":"UGIF: UI Grounded Instruction Following. arXiv preprint arXiv:2211.07615","author":"Venkatesh Sagar Gubbi","year":"2022","unstructured":"Sagar Gubbi Venkatesh, Partha Talukdar, and Srini Narayanan. 2022. UGIF: UI Grounded Instruction Following. arXiv preprint arXiv:2211.07615 (2022)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580895"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472749.3474765"},{"key":"e_1_3_2_1_44_1","volume-title":"Oh (Eds.)","volume":"35","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, et al. 2022. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. In Advances in Neural Information Processing Systems, S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh (Eds.), Vol. 35. Curran Associates, Inc., 24824--24837. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/9d5609613524ecf4f15af0f7b31abca4-Paper-Conference.pdf"},{"key":"e_1_3_2_1_45_1","volume-title":"DroidBot-GPT: GPT-powered UI Automation for Android. arXiv preprint arXiv:2304.07061","author":"Wen Hao","year":"2023","unstructured":"Hao Wen, Hongming Wang, Jiaxuan Liu, and Yuanchun Li. 2023. DroidBot-GPT: GPT-powered UI Automation for Android. arXiv preprint arXiv:2304.07061 (2023)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3117811.3117819"},{"key":"e_1_3_2_1_47_1","volume-title":"ReAct: Synergizing Reasoning and Acting in Language Models. In International Conference on Learning Representations (ICLR).","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao. 2023. ReAct: Synergizing Reasoning and Acting in Language Models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445186"},{"key":"e_1_3_2_1_49_1","unstructured":"Zhizheng Zhang Xiaoyi Zhang Wenxuan Xie and Yan Lu. 2023. Responsible Task Automation: Empowering Large Language Models as Responsible Task Automators. arXiv:cs.AI\/2306.01242"}],"event":{"name":"ACM MobiCom '24: 30th Annual International Conference on Mobile Computing and Networking","location":"Washington D.C. DC USA","acronym":"ACM MobiCom '24","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing"]},"container-title":["Proceedings of the 30th Annual International Conference on Mobile Computing and Networking"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3636534.3649379","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3636534.3649379","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T22:54:12Z","timestamp":1750287252000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3636534.3649379"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,29]]},"references-count":48,"alternative-id":["10.1145\/3636534.3649379","10.1145\/3636534"],"URL":"https:\/\/doi.org\/10.1145\/3636534.3649379","relation":{},"subject":[],"published":{"date-parts":[[2024,5,29]]},"assertion":[{"value":"2024-05-29","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}