{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:15:59Z","timestamp":1765307759985,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","funder":[{"name":"Open Foundation of Key Laboratory of Cyberspace Security, Ministry of Education","award":["No.KLCS20240207"],"award-info":[{"award-number":["No.KLCS20240207"]}]},{"name":"the National Natural Science Foundation of China","award":["62472398,U2336206"],"award-info":[{"award-number":["62472398,U2336206"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755553","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:44:48Z","timestamp":1761371088000},"page":"4679-4687","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MMPro: A Decoupled Perception-Thinking-Execution Framework for Secure GUI Agent"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-4441-173X","authenticated-orcid":false,"given":"Benlong","family":"Wu","sequence":"first","affiliation":[{"name":"School of Cyber Science and Technology, University of Science and Technology of China, HeFei, China and Key Laboratory of Cyberspace Security, Ministry of Education, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0326-1616","authenticated-orcid":false,"given":"Yuang","family":"Qi","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Technology, University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6660-9947","authenticated-orcid":false,"given":"Xiuwei","family":"Shang","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Technology, University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5576-6108","authenticated-orcid":false,"given":"Weiming","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Technology, University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4417-9316","authenticated-orcid":false,"given":"Nenghai","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Technology, University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9868-3414","authenticated-orcid":false,"given":"Kejiang","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Technology, University of Science and Technology of China, Hefei, China and Key Laboratory of Cyberspace Security, Ministry of Education, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, et al.","author":"Abdin Marah","year":"2024","unstructured":"Marah Abdin, Jyoti Aneja, Hany Awadalla, Ahmed Awadallah, Ammar Ahmad Awan, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, et al., 2024. Phi-3 technical report: A highly capable language model locally on your phone. arXiv preprint arXiv:2404.14219 (2024)."},{"key":"e_1_3_2_1_2_1","volume-title":"Agent s: An open agentic framework that uses computers like a human. arXiv preprint arXiv:2410.08164","author":"Agashe Saaket","year":"2024","unstructured":"Saaket Agashe, Jiuzhou Han, Shuyu Gan, Jiachen Yang, Ang Li, and Xin Eric Wang. 2024. Agent s: An open agentic framework that uses computers like a human. arXiv preprint arXiv:2410.08164 (2024)."},{"key":"e_1_3_2_1_3_1","unstructured":"Anthropic. 2024a. Claude 3 Model Card. https:\/\/www-cdn.anthropic.com\/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627\/Model_Card_Claude_3.pdf Accessed: 2024-03-04."},{"key":"e_1_3_2_1_4_1","unstructured":"Anthropic. 2024b. Claude 3.5 Sonnet Released. https:\/\/www.anthropic.com\/news\/claude-3-5-sonnet Accessed: 2024-06-20."},{"key":"e_1_3_2_1_5_1","unstructured":"Anthropic. 2025. Claude 3.7 Sonnet Released. https:\/\/www.anthropic.com\/news\/claude-3-7-sonnet Accessed: 2025-04-02."},{"key":"e_1_3_2_1_6_1","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding Localization Text Reading and Beyond. arXiv:2308.12966 [cs.CV] https:\/\/arxiv.org\/abs\/2308.12966"},{"key":"e_1_3_2_1_7_1","unstructured":"Rogerio Bonatti Dan Zhao Francesco Bonacci Dillon Dupont Sara Abdali Yinheng Li Yadong Lu Justin Wagle Kazuhito Koishida Arthur Bucker et al. 2024. Windows agent arena: Evaluating multi-modal os agents at scale. arXiv preprint arXiv:2409.08264 (2024)."},{"key":"e_1_3_2_1_8_1","volume-title":"Seeclick: Harnessing gui grounding for advanced visual gui agents. arXiv preprint arXiv:2401.10935","author":"Cheng Kanzhi","year":"2024","unstructured":"Kanzhi Cheng, Qiushi Sun, Yougang Chu, Fangzhi Xu, Yantao Li, Jianbing Zhang, and Zhiyong Wu. 2024. Seeclick: Harnessing gui grounding for advanced visual gui agents. arXiv preprint arXiv:2401.10935 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.bandl.2012.08.003"},{"key":"e_1_3_2_1_10_1","unstructured":"DeepSeek-AI and Daya Guo et.al. 2025. DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948 [cs.CL] https:\/\/arxiv.org\/abs\/2501.12948"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1126\/science.1072165"},{"key":"e_1_3_2_1_12_1","volume-title":"What'is happening in the dorsal visual pathway. Trends in cognitive sciences","author":"Freud Erez","year":"2016","unstructured":"Erez Freud, David C Plaut, and Marlene Behrmann. 2016. 'What'is happening in the dorsal visual pathway. Trends in cognitive sciences, Vol. 20, 10 (2016), 773-784."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3684998"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681488"},{"key":"e_1_3_2_1_15_1","volume-title":"Mustafa Safdari, Yutaka Matsuo, Douglas Eck, and Aleksandra Faust.","author":"Gur Izzeddin","year":"2023","unstructured":"Izzeddin Gur, Hiroki Furuta, Austin Huang, Mustafa Safdari, Yutaka Matsuo, Douglas Eck, and Aleksandra Faust. 2023. A real-world webagent with planning, long context understanding, and program synthesis. arXiv preprint arXiv:2307.12856 (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3106237.3106270"},{"key":"e_1_3_2_1_17_1","volume-title":"Cogagent: A visual language model for gui agents. 14281-14290.","author":"Hong Wenyi","year":"2024","unstructured":"Wenyi Hong, Weihan Wang, Qingsong Lv, Jiazheng Xu, Wenmeng Yu, Junhui Ji, Yan Wang, Zihan Wang, Yuxiao Dong, Ming Ding, et al., 2024. Cogagent: A visual language model for gui agents. 14281-14290."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"e_1_3_2_1_19_1","volume-title":"Neural transplantation studies reveal the brain's capacity for continuous reconstruction. Trends in neurosciences","author":"Isacson Ole","year":"1997","unstructured":"Ole Isacson and Terrence Deacon. 1997. Neural transplantation studies reveal the brain's capacity for continuous reconstruction. Trends in neurosciences, Vol. 20, 10 (1997), 477-482."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/234313.234387"},{"key":"e_1_3_2_1_21_1","unstructured":"Chengyou Jia Minnan Luo Zhuohang Dang Qiushi Sun Fangzhi Xu Junlin Hu Tianbao Xie and Zhiyong Wu. 2024. AgentStore: Scalable Integration of Heterogeneous Agents As Specialized Generalist Computer Assistant. arXiv:2410.18603 [cs.AI] https:\/\/arxiv.org\/abs\/2410.18603"},{"key":"e_1_3_2_1_22_1","volume-title":"Openvla: An open-source vision-language-action model. arXiv preprint arXiv:2406.09246","author":"Kim Moo Jin","year":"2024","unstructured":"Moo Jin Kim, Karl Pertsch, Siddharth Karamcheti, Ted Xiao, Ashwin Balakrishna, Suraj Nair, Rafael Rafailov, Ethan Foster, Grace Lam, Pannag Sanketi, et al., 2024. Openvla: An open-source vision-language-action model. arXiv preprint arXiv:2406.09246 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_24_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_25_1","volume-title":"Jiadai Sun, Jiaqi Wang, et al.","author":"Liu Xiao","year":"2024","unstructured":"Xiao Liu, Bo Qin, Dongzhu Liang, Guang Dong, Hanyu Lai, Hanchen Zhang, Hanlin Zhao, Iat Long Iong, Jiadai Sun, Jiaqi Wang, et al., 2024. Autoglm: Autonomous foundation agents for guis. arXiv preprint arXiv:2411.00820 (2024)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01480"},{"key":"e_1_3_2_1_27_1","unstructured":"OpenAI. 2023. GPT-4V(ision) System Card. https:\/\/openai.com\/research\/gpt-4v-system-card. Accessed: 2023-09-25."},{"key":"e_1_3_2_1_28_1","unstructured":"OpenAI. 2025. GPT-4o System Card. https:\/\/openai.com\/index\/gpt-4o-system-card\/ Accessed: 2025-04-02."},{"key":"e_1_3_2_1_29_1","volume-title":"Agent q: Advanced reasoning and learning for autonomous ai agents. arXiv preprint arXiv:2408.07199","author":"Putta Pranav","year":"2024","unstructured":"Pranav Putta, Edmund Mills, Naman Garg, Sumeet Motwani, Chelsea Finn, Divyansh Garg, and Rafael Rafailov. 2024. Agent q: Advanced reasoning and learning for autonomous ai agents. arXiv preprint arXiv:2408.07199 (2024)."},{"key":"e_1_3_2_1_30_1","unstructured":"Yujia Qin Yining Ye Junjie Fang Haoming Wang Shihao Liang Shizuo Tian Junda Zhang Jiahao Li Yunxin Li Shijue Huang Wanjun Zhong Kuanye Li Jiale Yang Yu Miao Woyu Lin Longxiang Liu Xu Jiang Qianli Ma Jingyu Li Xiaojun Xiao Kai Cai Chuang Li Yaowei Zheng Chaolin Jin Chen Li Xiao Zhou Minchao Wang Haoli Chen Zhaojian Li Haihua Yang Haifeng Liu Feng Lin Tao Peng Xin Liu and Guang Shi. 2025. UI-TARS: Pioneering Automated GUI Interaction with Native Agents. arXiv:2501.12326 [cs.AI] https:\/\/arxiv.org\/abs\/2501.12326"},{"key":"e_1_3_2_1_31_1","volume-title":"Androidworld: A dynamic benchmarking environment for autonomous agents. arXiv preprint arXiv:2405.14573","author":"Rawles Christopher","year":"2024","unstructured":"Christopher Rawles, Sarah Clinckemaillie, Yifan Chang, Jonathan Waltz, Gabrielle Lau, Marybeth Fair, Alice Li, William Bishop, Wei Li, Folawiyo Campbell-Ajala, et al., 2024. Androidworld: A dynamic benchmarking environment for autonomous agents. arXiv preprint arXiv:2405.14573 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"Towards Trustworthy GUI Agents: A Survey. arXiv preprint arXiv:2503.23434","author":"Shi Yucheng","year":"2025","unstructured":"Yucheng Shi, Wenhao Yu, Wenlin Yao, Wenhu Chen, and Ninghao Liu. 2025. Towards Trustworthy GUI Agents: A Survey. arXiv preprint arXiv:2503.23434 (2025)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1017\/S0140525X00037249"},{"key":"e_1_3_2_1_34_1","volume-title":"Development","volume":"145","author":"Stoeckli Esther T","year":"2018","unstructured":"Esther T Stoeckli. 2018. Understanding axon guidance: are we nearly there yet? Development, Vol. 145, 10 (2018), dev151415."},{"key":"e_1_3_2_1_35_1","unstructured":"Gemini Team and Petko Georgiev et.al. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv:2403.05530 [cs.CL] https:\/\/arxiv.org\/abs\/2403.05530"},{"key":"e_1_3_2_1_36_1","unstructured":"Qwen Team. 2025. QwQ-32B: Embracing the Power of Reinforcement Learning. https:\/\/qwenlm.github.io\/blog\/qwq-32b\/"},{"key":"e_1_3_2_1_37_1","volume-title":"InferDPT: Privacy-preserving Inference for Black-box Large Language Models","author":"Tong Meng","year":"2025","unstructured":"Meng Tong, Kejiang Chen, Jie Zhang, Yuang Qi, Weiming Zhang, Nenghai Yu, Tianwei Zhang, and Zhikun Zhang. 2025. InferDPT: Privacy-preserving Inference for Black-box Large Language Models. IEEE Transactions on Dependable and Secure Computing (2025)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6898"},{"key":"e_1_3_2_1_39_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024a. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191","author":"Wang Peng","year":"2024","unstructured":"Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Yang Fan, Kai Dang, Mengfei Du, Xuancheng Ren, Rui Men, Dayiheng Liu, Chang Zhou, Jingren Zhou, and Junyang Lin. 2024b. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680886"},{"key":"e_1_3_2_1_42_1","first-page":"52040","article-title":"Osworld: Benchmarking multimodal agents for open-ended tasks in real computer environments","volume":"37","author":"Xie Tianbao","year":"2024","unstructured":"Tianbao Xie, Danyang Zhang, Jixuan Chen, Xiaochuan Li, Siheng Zhao, Ruisheng Cao, Jing Hua Toh, Zhoujun Cheng, Dongchan Shin, Fangyu Lei, et al., 2024. Osworld: Benchmarking multimodal agents for open-ended tasks in real computer environments. Advances in Neural Information Processing Systems, Vol. 37 (2024), 52040-52094.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681597"},{"key":"e_1_3_2_1_44_1","unstructured":"Chaoyun Zhang Shilin He Jiaxu Qian Bowen Li Liqun Li Si Qin Yu Kang Minghua Ma Guyue Liu Qingwei Lin et al. 2024. Large language model-brained gui agents: A survey. arXiv preprint arXiv:2411.18279 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755553","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:13:20Z","timestamp":1765307600000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755553"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":45,"alternative-id":["10.1145\/3746027.3755553","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755553","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}