{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:14:32Z","timestamp":1765340072817,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372187"],"award-info":[{"award-number":["62372187"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFC3601005"],"award-info":[{"award-number":["2022YFC3601005"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangdong Provincial Key Laboratory of Human Digital Twin","award":["2022B1212010004"],"award-info":[{"award-number":["2022B1212010004"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755695","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"8797-8805","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MVISU-Bench: Benchmarking Mobile Agents for Real-World Tasks by Multi-App, Vague, Interactive, Single-App and Unethical Instructions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2578-0135","authenticated-orcid":false,"given":"Zeyu","family":"Huang","sequence":"first","affiliation":[{"name":"South China University of Technology, Guangzhou, 
China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8263-1861","authenticated-orcid":false,"given":"Juyuan","family":"Wang","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0478-7684","authenticated-orcid":false,"given":"Longfeng","family":"Chen","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5010-9805","authenticated-orcid":false,"given":"Boyi","family":"Xiao","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4259-2350","authenticated-orcid":false,"given":"Leng","family":"Cai","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1908-1157","authenticated-orcid":false,"given":"Yawen","family":"Zeng","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8735-3532","authenticated-orcid":false,"given":"Jin","family":"Xu","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China and Pazhou Lab, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu et al. 2025. Qwen2.5-VL Technical Report. 
arXiv:2502.13923 [cs.CV] https:\/\/arxiv.org\/abs\/2502.13923"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Leng Cai Junxuan He Yikai Li Junjie Liang Yuanping Lin Ziming Quan Yawen Zeng and Jin Xu. 2025. RTBAgent: A LLM-based Agent System for Real-Time Bidding. arXiv:2502.00792 [cs.AI] https:\/\/arxiv.org\/abs\/2502.00792","DOI":"10.1145\/3701716.3715259"},{"key":"e_1_3_2_1_3_1","volume-title":"Gui-world: A dataset for gui-oriented multimodal llm-based agents. arXiv e-prints","author":"Dongping Chen","year":"2024","unstructured":"Dongping Chen et al., 2024a. Gui-world: A dataset for gui-oriented multimodal llm-based agents. arXiv e-prints (2024), arXiv-2406."},{"key":"e_1_3_2_1_4_1","volume-title":"NeurIPS 2024 Workshop on Open-World Agents.","author":"Chen Jingxuan","year":"2024","unstructured":"Jingxuan Chen, Derek Yuen, Bin Xie, Yuhao Yang, Gongwei Chen, Zhihao Wu, Li Yixing, Xurui Zhou, Weiwen Liu, Shuai Wang, et al., 2024b. Spa-bench: A comprehensive benchmark for smartphone agent evaluation. In NeurIPS 2024 Workshop on Open-World Agents."},{"key":"e_1_3_2_1_5_1","unstructured":"Peng Chen Pi Bu Yingyao Wang Xinyi Wang Ziming Wang Jie Guo Yingxiu Zhao Qi Zhu Jun Song Siran Yang Jiamang Wang and Bo Zheng. 2025. CombatVLA: An Efficient Vision-Language-Action Model for Combat Tasks in 3D Action Role-Playing Games. arXiv:2503.09527 [cs.CV] https:\/\/arxiv.org\/abs\/2503.09527"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00808"},{"key":"e_1_3_2_1_7_1","volume-title":"Mobile-bench: An evaluation benchmark for llm-based mobile agents. arXiv preprint arXiv:2407.00993","author":"Deng Shihan","year":"2024","unstructured":"Shihan Deng, Weikai Xu, Hongda Sun, Wei Liu, Tao Tan, Jianfeng Liu, Ang Li, Jian Luan, Bin Wang, Rui Yan, et al., 2024. Mobile-bench: An evaluation benchmark for llm-based mobile agents. 
arXiv preprint arXiv:2407.00993 (2024)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3684998"},{"key":"e_1_3_2_1_9_1","unstructured":"Jihao Gu Qihang Ai Yingyao Wang Pi Bu Jingxuan Xing Zekun Zhu Wei Jiang Ziming Wang Yingxiu Zhao Ming-Liang Zhang Jun Song Yuning Jiang and Bo Zheng. 2025. Mobile-R1: Towards Interactive Reinforcement Learning for VLM-Based Mobile Agent via Task-Level Rewards. arXiv:2506.20332 [cs.AI] https:\/\/arxiv.org\/abs\/2506.20332"},{"key":"e_1_3_2_1_10_1","volume-title":"AI Works-A Cognitive Journey into Digital World. arXiv preprint arXiv:2412.17589","author":"He Yanheng","year":"2024","unstructured":"Yanheng He, Jiahe Jin, Shijie Xia, Jiadi Su, Runze Fan, Haoyang Zou, Xiangkun Hu, and Pengfei Liu. 2024. PC Agent: While You Sleep, AI Works-A Cognitive Journey into Digital World. arXiv preprint arXiv:2412.17589 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Zijuan Lin, Liyang Zhou, et al.","author":"Hong Sirui","year":"2023","unstructured":"Sirui Hong, Xiawu Zheng, Jonathan Chen, Yuheng Cheng, Jinlin Wang, Ceyao Zhang, Zili Wang, Steven Ka Shing Yau, Zijuan Lin, Liyang Zhou, et al., 2023. Metagpt: Meta programming for multi-agent collaborative framework. arXiv preprint arXiv:2308.00352, Vol. 3, 4 (2023), 6."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01354"},{"key":"e_1_3_2_1_13_1","volume-title":"AppAgentX: Evolving GUI Agents as Proficient Smartphone Users. arXiv preprint arXiv:2503.02268","author":"Jiang Wenjia","year":"2025","unstructured":"Wenjia Jiang, Yangyang Zhuang, Chenxi Song, Xu Yang, and Chi Zhang. 2025. AppAgentX: Evolving GUI Agents as Proficient Smartphone Users. 
arXiv preprint arXiv:2503.02268 (2025)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671620"},{"key":"e_1_3_2_1_15_1","volume-title":"W Bradley Knox, and Kimin Lee.","author":"Lee Juyong","year":"2024","unstructured":"Juyong Lee, Dongyoon Hahm, June Suk Choi, W Bradley Knox, and Kimin Lee. 2024. Mobilesafetybench: Evaluating safety of autonomous agents in mobile device control. arXiv preprint arXiv:2410.17520 (2024)."},{"key":"e_1_3_2_1_16_1","unstructured":"Xiangyu Li Yawen Zeng Xiaofen Xing Jin Xu and Xiangmin Xu. 2025. HedgeAgents: A Balanced-aware Multi-agent Financial Trading System. arXiv:2502.13165 [cs.MA] https:\/\/arxiv.org\/abs\/2502.13165"},{"key":"e_1_3_2_1_17_1","volume-title":"Mapping natural language instructions to mobile UI action sequences. arXiv preprint arXiv:2005.03776","author":"Li Yang","year":"2020","unstructured":"Yang Li, Jiacong He, Xin Zhou, Yuan Zhang, and Jason Baldridge. 2020. Mapping natural language instructions to mobile UI action sequences. arXiv preprint arXiv:2005.03776 (2020)."},{"key":"e_1_3_2_1_18_1","volume-title":"Appagent v2: Advanced agent for flexible mobile interactions. arXiv preprint arXiv:2408.11824","author":"Li Yanda","year":"2024","unstructured":"Yanda Li, Chi Zhang, Wanqi Yang, Bin Fu, Pei Cheng, Xin Chen, Ling Chen, and Yunchao Wei. 2024. Appagent v2: Advanced agent for flexible mobile interactions. arXiv preprint arXiv:2408.11824 (2024)."},{"key":"e_1_3_2_1_19_1","unstructured":"Haowei Liu Xi Zhang Haiyang Xu Yuyang Wanyan Junyang Wang Ming Yan Ji Zhang Chunfeng Yuan Changsheng Xu Weiming Hu et al. 2025. PC-Agent: A Hierarchical Multi-Agent Collaboration Framework for Complex Task Automation on PC. 
arXiv preprint arXiv:2502.14282 (2025)."},{"key":"e_1_3_2_1_20_1","volume-title":"Jiadai Sun, Jiaqi Wang, et al.","author":"Liu Xiao","year":"2024","unstructured":"Xiao Liu, Bo Qin, Dongzhu Liang, Guang Dong, Hanyu Lai, Hanchen Zhang, Hanlin Zhao, Iat Long Iong, Jiadai Sun, Jiaqi Wang, et al., 2024. Autoglm: Autonomous foundation agents for guis. arXiv preprint arXiv:2411.00820 (2024)."},{"key":"e_1_3_2_1_21_1","volume-title":"Text2Event: Controllable sequence-to-structure generation for end-to-end event extraction. arXiv preprint arXiv:2106.09232","author":"Lu Yaojie","year":"2021","unstructured":"Yaojie Lu, Hongyu Lin, Jin Xu, Xianpei Han, Jialong Tang, Annan Li, Le Sun, Meng Liao, and Shaoyi Chen. 2021. Text2Event: Controllable sequence-to-structure generation for end-to-end event extraction. arXiv preprint arXiv:2106.09232 (2021)."},{"key":"e_1_3_2_1_22_1","volume-title":"Omniparser for pure vision based gui agent. arXiv preprint arXiv:2408.00203","author":"Lu Yadong","year":"2024","unstructured":"Yadong Lu, Jianwei Yang, Yelong Shen, and Ahmed Awadallah. 2024. Omniparser for pure vision based gui agent. arXiv preprint arXiv:2408.00203 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"Coco-agent: A comprehensive cognitive mllm agent for smartphone gui automation. arXiv preprint arXiv:2402.11941","author":"Ma Xinbei","year":"2024","unstructured":"Xinbei Ma, Zhuosheng Zhang, and Hai Zhao. 2024. Coco-agent: A comprehensive cognitive mllm agent for smartphone gui automation. arXiv preprint arXiv:2402.11941 (2024)."},{"key":"e_1_3_2_1_24_1","unstructured":"Yujia Qin Yining Ye Junjie Fang Haoming Wang Shihao Liang Shizuo Tian Junda Zhang Jiahao Li Yunxin Li Shijue Huang et al. 2025. UI-TARS: Pioneering Automated GUI Interaction with Native Agents. 
arXiv preprint arXiv:2501.12326 (2025)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680792"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676386"},{"key":"e_1_3_2_1_27_1","volume-title":"AutoEval: A Practical Framework for Autonomous Evaluation of Mobile Agents. arXiv preprint arXiv:2503.02403","author":"Sun Jiahui","year":"2025","unstructured":"Jiahui Sun, Zhichao Hua, and Yubin Xia. 2025. AutoEval: A Practical Framework for Autonomous Evaluation of Mobile Agents. arXiv preprint arXiv:2503.02403 (2025)."},{"key":"e_1_3_2_1_28_1","volume-title":"Meta-gui: Towards multi-modal conversational agents on mobile gui. arXiv preprint arXiv:2205.11029","author":"Sun Liangtai","year":"2022","unstructured":"Liangtai Sun, Xingyu Chen, Lu Chen, Tianle Dai, Zichen Zhu, and Kai Yu. 2022. Meta-gui: Towards multi-modal conversational agents on mobile gui. arXiv preprint arXiv:2205.11029 (2022)."},{"key":"e_1_3_2_1_29_1","volume-title":"Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191","author":"Team Qwen","year":"2024","unstructured":"Qwen Team. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_30_1","volume-title":"Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration. arXiv preprint arXiv:2406.01014","author":"Wang Junyang","year":"2024","unstructured":"Junyang Wang, Haiyang Xu, Haitao Jia, Xi Zhang, Ming Yan, Weizhou Shen, Ji Zhang, Fei Huang, and Jitao Sang. 2024b. Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration. arXiv preprint arXiv:2406.01014 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Mobile-agent: Autonomous multi-modal mobile device agent with visual perception. 
arXiv preprint arXiv:2401.16158","author":"Wang Junyang","year":"2024","unstructured":"Junyang Wang, Haiyang Xu, Jiabo Ye, Ming Yan, Weizhou Shen, Ji Zhang, Fei Huang, and Jitao Sang. 2024c. Mobile-agent: Autonomous multi-modal mobile device agent with visual perception. arXiv preprint arXiv:2401.16158 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"MobileAgentBench: An Efficient and User-Friendly Benchmark for Mobile LLM Agents. arXiv preprint arXiv:2406.08184","author":"Wang Luyuan","year":"2024","unstructured":"Luyuan Wang, Yongyu Deng, Yiwei Zha, Guodong Mao, Qinmin Wang, Tianchen Min, Wei Chen, and Shoufa Chen. 2024a. MobileAgentBench: An Efficient and User-Friendly Benchmark for Mobile LLM Agents. arXiv preprint arXiv:2406.08184 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"Mobile-Agent-E: Self-Evolving Mobile Assistant for Complex Tasks. arXiv preprint arXiv:2501.11733","author":"Wang Zhenhailong","year":"2025","unstructured":"Zhenhailong Wang, Haiyang Xu, Junyang Wang, Xi Zhang, Ming Yan, Ji Zhang, Fei Huang, and Heng Ji. 2025. Mobile-Agent-E: Self-Evolving Mobile Assistant for Complex Tasks. arXiv preprint arXiv:2501.11733 (2025)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649379"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Hao Wen Shizuo Tian Borislav Pavlov Wenjie Du Yixuan Li Ge Chang Shanhui Zhao Jiacheng Liu Yunxin Liu Ya-Qin Zhang et al. 2024b. AutoDroid-V2: Boosting SLM-based GUI Agents via Code Generation. arXiv preprint arXiv:2412.18116 (2024).","DOI":"10.1145\/3711875.3729134"},{"key":"e_1_3_2_1_36_1","volume-title":"Droidbot-gpt: Gpt-powered ui automation for android. arXiv preprint arXiv:2304.07061","author":"Wen Hao","year":"2023","unstructured":"Hao Wen, Hongming Wang, Jiaxuan Liu, and Yuanchun Li. 2023. Droidbot-gpt: Gpt-powered ui automation for android. 
arXiv preprint arXiv:2304.07061 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671650"},{"key":"e_1_3_2_1_38_1","volume-title":"Androidlab: Training and systematic benchmarking of android autonomous agents. arXiv preprint arXiv:2410.24024","author":"Xu Yifan","year":"2024","unstructured":"Yifan Xu, Xiao Liu, Xueqiao Sun, Siyi Cheng, Hao Yu, Hanyu Lai, Shudan Zhang, Dan Zhang, Jie Tang, and Yuxiao Dong. 2024. Androidlab: Training and systematic benchmarking of android autonomous agents. arXiv preprint arXiv:2410.24024 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Auto-gpt for online decision making: Benchmarks and additional opinions. arXiv preprint arXiv:2306.02224","author":"Yang Hui","year":"2023","unstructured":"Hui Yang, Sifu Yue, and Yunzhong He. 2023. Auto-gpt for online decision making: Benchmarks and additional opinions. arXiv preprint arXiv:2306.02224 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"European Conference on Computer Vision. Springer, 240-255","author":"You Keen","year":"2024","unstructured":"Keen You, Haotian Zhang, Eldon Schoop, Floris Weers, Amanda Swearngin, Jeffrey Nichols, Yinfei Yang, and Zhe Gan. 2024. Ferret-ui: Grounded mobile ui understanding with multimodal llms. In European Conference on Computer Vision. Springer, 240-255."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00225"},{"key":"e_1_3_2_1_42_1","unstructured":"Chi Zhang Zhao Yang Jiaxuan Liu Yucheng Han Xin Chen Zebiao Huang Bin Fu and Gang Yu. 2023. AppAgent: Multimodal Agents as Smartphone Users. arXiv:2312.13771 [cs.CV]"},{"key":"e_1_3_2_1_43_1","volume-title":"Android in the zoo: Chain-of-action-thought for gui agents. arXiv preprint arXiv:2403.02713","author":"Zhang Jiwen","year":"2024","unstructured":"Jiwen Zhang, Jihao Wu, Yihua Teng, Minghui Liao, Nuo Xu, Xiao Xiao, Zhongyu Wei, and Duyu Tang. 2024b. Android in the zoo: Chain-of-action-thought for gui agents. 
arXiv preprint arXiv:2403.02713 (2024)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676382"},{"key":"e_1_3_2_1_45_1","volume-title":"GUI Testing Arena: A Unified Benchmark for Advancing Autonomous GUI Testing Agent. arXiv preprint arXiv:2412.18426","author":"Zhao Kangjia","year":"2024","unstructured":"Kangjia Zhao, Jiahui Song, Leigang Sha, HaoZhan Shen, Zhi Chen, Tiancheng Zhao, Xiubo Liang, and Jianwei Yin. 2024. GUI Testing Arena: A Unified Benchmark for Advancing Autonomous GUI Testing Agent. arXiv preprint arXiv:2412.18426 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755695","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:10:14Z","timestamp":1765339814000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755695"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":45,"alternative-id":["10.1145\/3746027.3755695","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755695","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}