{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T13:13:24Z","timestamp":1780060404720,"version":"3.54.0"},"publisher-location":"New York, NY, USA","reference-count":85,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T00:00:00Z","timestamp":1781913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Institute of Information & Communications Technology Planning & Evaluation (IITP)","award":["RS-2024-00395134"],"award-info":[{"award-number":["RS-2024-00395134"]}]},{"name":"Institute of Information & Communications Technology Planning & Evaluation (IITP)","award":["RS-2024-00438851"],"award-info":[{"award-number":["RS-2024-00438851"]}]},{"name":"Institute of Information & Communications Technology Planning & Evaluation (IITP)","award":["RS-2024-00457882"],"award-info":[{"award-number":["RS-2024-00457882"]}]},{"name":"Institute of Information & Communications Technology Planning & Evaluation (IITP)","award":["RS-2025-02214652"],"award-info":[{"award-number":["RS-2025-02214652"]}]},{"name":"Samsung Electronics Co., Ltd.","award":["IO251210-14212-01"],"award-info":[{"award-number":["IO251210-14212-01"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,21]]},"DOI":"10.1145\/3745756.3809195","type":"proceedings-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:52:21Z","timestamp":1780059141000},"page":"144-157","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Agent-X: Full Pipeline Acceleration of On-device AI Agents"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5652-604X","authenticated-orcid":false,"given":"Jinha","family":"Chung","sequence":"first","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6443-4398","authenticated-orcid":false,"given":"Byeongjun","family":"Shin","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1297-1509","authenticated-orcid":false,"given":"Jiin","family":"Kim","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3303-8681","authenticated-orcid":false,"given":"Minsoo","family":"Rhu","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S. Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-latency Tradeoff in LLM Inference with Sarathi-serve. In Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_2_1","volume-title":"SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. In arxiv.org.","author":"Agrawal Amey","year":"2023","unstructured":"Amey Agrawal, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S. Gulavani, and Ramachandran Ramjee. 2023. SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. In arxiv.org."},{"key":"e_1_3_2_1_3_1","volume-title":"Dmitry Belenko, S Khatamifard, Minsik Cho, Carlo C Del Mundo, Mohammad Rastegari, and Mehrdad Farajtabar.","author":"Alizadeh Keivan","year":"2024","unstructured":"Keivan Alizadeh, Seyed Iman Mirzadeh, Dmitry Belenko, S Khatamifard, Minsik Cho, Carlo C Del Mundo, Mohammad Rastegari, and Mehrdad Farajtabar. 2024. LLM in a Flash: Efficient Large Language Model Inference with Limited Memory. In Proceedings of the ACL (Association for Computational Linguistics)."},{"key":"e_1_3_2_1_4_1","unstructured":"AMD. 2025. https:\/\/www.amd.com\/en\/products\/processors\/laptop\/ryzen.html"},{"key":"e_1_3_2_1_5_1","unstructured":"AMD. 2025. AMD Instinct MI325X Accelerator. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/instinct-tech-docs\/product-briefs\/instinct-mi325x-datasheet.pdf"},{"key":"e_1_3_2_1_6_1","unstructured":"Anthropic. 2024. https:\/\/www.anthropic.com\/solutions\/agents"},{"key":"e_1_3_2_1_7_1","unstructured":"Anthropic. 2024. Introducing the Model Context Protocol. https:\/\/www.anthropic.com\/news\/model-context-protocol"},{"key":"e_1_3_2_1_8_1","unstructured":"Apoorv Saxena. 2023. Prompt Lookup Decoding. https:\/\/github.com\/apoorvumang\/prompt-lookup-decoding"},{"key":"e_1_3_2_1_9_1","unstructured":"Apple. 2010. Siri. https:\/\/www.apple.com\/siri"},{"key":"e_1_3_2_1_10_1","unstructured":"Apple. 2023. https:\/\/github.com\/ml-explore\/mlx-lm"},{"key":"e_1_3_2_1_11_1","unstructured":"Apple. 2024. Apple Intelligence. https:\/\/www.apple.com\/apple-intelligence"},{"key":"e_1_3_2_1_12_1","unstructured":"Apple. 2024. Apple Introduces M4 Pro and M4 Max. https:\/\/www.apple.com\/newsroom\/2024\/10\/apple-introduces-m4-pro-and-m4-max"},{"key":"e_1_3_2_1_13_1","unstructured":"Apple. 2024. Introducing Apple's On-device and Server Foundation Models. https:\/\/machinelearning.apple.com\/research\/introducing-apple-foundation-models"},{"key":"e_1_3_2_1_14_1","unstructured":"Apple Developer. 2025. https:\/\/developer.apple.com\/documentation\/FoundationModels"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS).","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-shot Learners. In Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_16_1","unstructured":"Charlie Chen Sebastian Borgeaud Geoffrey Irving Jean-Baptiste Lespiau Laurent Sifre and John Jumper. 2023. Accelerating Large Language Model Decoding with Speculative Sampling. In arxiv.org."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1006\/csla.1999.0128"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711875.3729128"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Lutfi Eren Erdogan Nicholas Lee Siddharth Jha Sehoon Kim Ryan Tabrizi Suhong Moon Coleman Hooper Gopala Anumanchipalli Kurt Keutzer and Amir Gholami. 2024. TinyAgent: Function Calling at the Edge. In arxiv.org.","DOI":"10.18653\/v1\/2024.emnlp-demo.9"},{"key":"e_1_3_2_1_20_1","unstructured":"Lutfi Eren Erdogan Nicholas Lee Sehoon Kim Suhong Moon Hiroki Furuta Gopala Anumanchipalli Kurt Keutzer and Amir Gholami. 2025. Plan-and-Act: Improving Planning of Agents for Long-horizon Tasks. In arxiv.org."},{"key":"e_1_3_2_1_21_1","unstructured":"Tiantian Gan and Qiyao Sun. 2025. RAG-MCP: Mitigating Prompt Bloat in LLM Tool Selection via Retrieval-augmented Generation. In arxiv.org."},{"key":"e_1_3_2_1_22_1","unstructured":"Gemini Team. 2025. Gemini 2.5: Pushing the Frontier with Advanced Reasoning Multimodality Long Context and Next Generation Agentic Capabilities. In arxiv.org."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of Machine Learning and Systems (MLSYS).","author":"Gim In","year":"2024","unstructured":"In Gim, Guojun Chen, Seung-seob Lee, Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. 2024. Prompt Cache: Modular Attention Reuse for Low-latency Inference. In Proceedings of Machine Learning and Systems (MLSYS)."},{"key":"e_1_3_2_1_24_1","unstructured":"Google. 2024. https:\/\/gemini.google\/assistant"},{"key":"e_1_3_2_1_25_1","unstructured":"Google Blog. 2024. Circle (or Highlight or Scribble) to Search. https:\/\/blog.google\/products\/search\/google-circle-to-search-android"},{"key":"e_1_3_2_1_26_1","unstructured":"Google Blog. 2025. A New Era of Intelligence with Gemini 3. https:\/\/blog.google\/products\/gemini\/gemini-3\/"},{"key":"e_1_3_2_1_27_1","unstructured":"Google Blog. 2025. Gemini CLI: Your Open-source AI Agent. https:\/\/blog.google\/technology\/developers\/introducing-gemini-cli-open-source-ai-agent\/"},{"key":"e_1_3_2_1_28_1","unstructured":"Google Cloud. 2024. TPU v6e. https:\/\/cloud.google.com\/tpu\/docs\/v6e"},{"key":"e_1_3_2_1_29_1","unstructured":"Google DeepMind. 2023. https:\/\/deepmind.google\/models\/gemini\/nano"},{"key":"e_1_3_2_1_30_1","unstructured":"Google Developers. 2025. https:\/\/ai.google.dev\/gemini-api\/docs\/function-calling"},{"key":"e_1_3_2_1_31_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey et al. 2024. The Llama 3 Herd of Models. In arxiv.org."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716267"},{"key":"e_1_3_2_1_33_1","unstructured":"Awni Hannun Jagrit Digani Angelos Katharopoulos and Ronan Collobert. 2023. https:\/\/github.com\/ml-explore\/mlx"},{"key":"e_1_3_2_1_34_1","volume-title":"EdgeLLM: A Highly Efficient CPU-FPGA Heterogeneous Edge Accelerator for Large Language Models","author":"Huang Mingqiang","year":"2025","unstructured":"Mingqiang Huang, Ao Shen, Kai Li, Haoxiang Peng, Boyu Li, Yupeng Su, and Hao Yu. 2025. EdgeLLM: A Highly Efficient CPU-FPGA Heterogeneous Edge Accelerator for Large Language Models. IEEE Transactions on Circuits and Systems I: Regular Papers (2025)."},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS).","author":"Kamath Aditya K.","year":"2025","unstructured":"Aditya K. Kamath, Ramya Prabhu, Jayashree Mohan, Simon Peter, Ramachandran Ramjee, and Ashish Panwar. 2025. POD-attention: Unlocking Full Prefill-decode Overlap for Faster LLM Inference. In Proceedings of the International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)."},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Kim Sehoon","year":"2024","unstructured":"Sehoon Kim, Suhong Moon, Ryan Tabrizi, Nicholas Lee, Michael W Mahoney, Kurt Keutzer, and Amir Gholami. 2024. An LLM Compiler for Parallel Function Calling. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_38_1","volume-title":"Learning the Parts of Objects by Non-negative Matrix Factorization. Nature 401, 6755","author":"Lee Daniel D","year":"1999","unstructured":"Daniel D Lee and H Sebastian Seung. 1999. Learning the Parts of Objects by Non-negative Matrix Factorization. Nature 401, 6755 (1999), 788\u2013791."},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. 2023. Fast Inference from Transformers via Speculative Decoding. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Li Yuhui","year":"2024","unstructured":"Yuhui Li, Fangyun Wei, Chao Zhang, and Hongyang Zhang. 2024. EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_41_1","unstructured":"Jiachang Liu Dinghan Shen Yizhe Zhang Bill Dolan Lawrence Carin and Weizhu Chen. 2021. What Makes Good In-Context Examples for GPT-3?. In arxiv.org."},{"key":"e_1_3_2_1_42_1","volume-title":"HERA: Hybrid Edge-cloud Resource Allocation for Cost-efficient AI Agents. In arxiv.org.","author":"Liu Shiyi","year":"2025","unstructured":"Shiyi Liu, Haiying Shen, Shuai Che, Mahdi Ghandi, and Mingqin Li. 2025. HERA: Hybrid Edge-cloud Resource Allocation for Cost-efficient AI Agents. In arxiv.org."},{"key":"e_1_3_2_1_43_1","unstructured":"LM Studio. 2024. https:\/\/github.com\/lmstudio-ai\/mlx-engine"},{"key":"e_1_3_2_1_44_1","unstructured":"Manus AI. 2025. Manus. https:\/\/manus.im\/"},{"key":"e_1_3_2_1_45_1","unstructured":"Meta. 2024. Llama 3.2: Revolutionizing Edge AI and Vision with Open Customizable Models. https:\/\/ai.meta.com\/blog\/llama-3-2-connect-2024-vision-edge-mobile-devices"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"e_1_3_2_1_47_1","unstructured":"Microsoft. 2024. https:\/\/www.microsoft.com\/en-us\/windows\/business\/devices\/copilot-plus-pcs"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics (NAACL).","author":"Min Sewon","year":"2022","unstructured":"Sewon Min, Mike Lewis, Luke Zettlemoyer, and Hannaneh Hajishirzi. 2022. MetalCL: Learning to Learn in Context. In Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)."},{"key":"e_1_3_2_1_49_1","unstructured":"NVIDIA. 2024. NVIDIA H100 Tensor Core GPU. https:\/\/resources.nvidia.com\/en-us-hopper-architecture\/nvidia-tensor-core-gpu-datasheet"},{"key":"e_1_3_2_1_50_1","unstructured":"NVIDIA. 2024. NVIDIA H200 Tensor Core GPU. https:\/\/nvdam.widen.net\/s\/nb5zzzsjdf\/hpc-datasheet-sc23-h200-datasheet-3002446"},{"key":"e_1_3_2_1_51_1","unstructured":"NVIDIA. 2025. NVIDIA Blackwell Architecture Technical Brief. https:\/\/resources.nvidia.com\/en-us-blackwell-architecture"},{"key":"e_1_3_2_1_52_1","unstructured":"OpenAI. 2025. Introducing ChatGPT Agent: Bridging Research and Action. https:\/\/openai.com\/index\/introducing-chatgpt-agent\/"},{"key":"e_1_3_2_1_53_1","unstructured":"OpenAI. 2025. Introducing Deep Research. https:\/\/openai.com\/index\/introducing-deep-research\/"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.23919\/DATE64628.2025.10992798"},{"key":"e_1_3_2_1_55_1","volume-title":"Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Park Yeonhong","unstructured":"Yeonhong Park, Jake Hyun, Hojoon Kim, and Jae W. Lee. 2025. DecDEC: A Systems Approach to Advancing Low-bit LLM Quantization. In Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_57_1","unstructured":"Qualcomm. 2024. https:\/\/www.qualcomm.com\/products\/mobile\/snapdragon\/laptops-and-tablets\/snapdragon-x-elite"},{"key":"e_1_3_2_1_58_1","unstructured":"Qualcomm. 2024. Hexagon NPU SDK. https:\/\/www.qualcomm.com\/developer\/software\/hexagon-npu-sdk"},{"key":"e_1_3_2_1_59_1","unstructured":"Samsung. 2017. https:\/\/www.samsung.com\/us\/apps\/bixby"},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of the International Conference on Computer-Aided Design.","author":"Sarkar Rishov","year":"2023","unstructured":"Rishov Sarkar, Hanxue Liang, Zhiwen Fan, Zhangyang Wang, and Cong Hao. 2023. Edge-MoE: Memory-efficient Multi-task Vision Transformer Architecture with Task-level Sparsity via Mixture-of-Experts. In Proceedings of the International Conference on Computer-Aided Design."},{"key":"e_1_3_2_1_61_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS).","author":"Schick Timo","year":"2023","unstructured":"Timo Schick, Jane Dwivedi-Yu, Roberto Dess\u00ec, Roberta Raileanu, Maria Lomeli, Eric Hambro, Luke Zettlemoyer, Nicola Cancedda, and Thomas Scialom. 2023. Toolformer: Language Models Can Teach Themselves to Use Tools. In Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_62_1","volume-title":"Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA).","author":"Seo Seong Hoon","unstructured":"Seong Hoon Seo, Junghoon Kim, Donghyun Lee, Seonah Yoo, Seokwon Moon, Yeonhong Park, and Jae W. Lee. 2025. FACIL: Flexible DRAM Address Mapping for SoC-PIM Cooperative On-device LLM Inference. In Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS).","author":"Shen Yongliang","year":"2023","unstructured":"Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu, and Yueting Zhuang. 2023. HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face. In Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711875.3729141"},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS).","author":"Shinn Noah","year":"2023","unstructured":"Noah Shinn, Federico Cassano, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao. 2023. Reflexion: Language Agents with Verbal Reinforcement Learning. In Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_66_1","unstructured":"Simranjit Singh Andreas Karatzas Michael Fore Iraklis Anagnostopoulos and Dimitrios Stamoulis. 2024. An LLM-tool Compiler for Fused Parallel Function Calling. In arxiv.org."},{"key":"e_1_3_2_1_67_1","unstructured":"Squeeze AI Lab. 2024. TinyAgent-7B. https:\/\/huggingface.co\/squeeze-ai-lab\/TinyAgent-7B"},{"key":"e_1_3_2_1_68_1","unstructured":"Squeeze AI Lab. 2024. TinyAgent-dataset. https:\/\/huggingface.co\/datasets\/squeeze-ai-lab\/TinyAgent-dataset"},{"key":"e_1_3_2_1_69_1","unstructured":"Squeeze AI Lab. 2024. TinyAgent-ToolRAG. https:\/\/huggingface.co\/squeeze-ai-lab\/TinyAgent-ToolRAG"},{"key":"e_1_3_2_1_70_1","volume-title":"Proceedings of the USENIX Annual Technical Conference (ATC).","author":"Tian Chunlin","year":"2025","unstructured":"Chunlin Tian, Xinpeng Qin, Kahou Tam, Li Li, Zijian Wang, Yuanzhe Zhao, Minglei Zhang, and Chengzhong Xu. 2025. CLONE: Customizing LLMs for Efficient Latency-aware Inference at the Edge. In Proceedings of the USENIX Annual Technical Conference (ATC)."},{"key":"e_1_3_2_1_71_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Timor Nadav","year":"2025","unstructured":"Nadav Timor, Jonathan Mamou, Daniel Korat, Moshe Berchansky, Gaurav Jain, Oren Pereg, Moshe Wasserblat, and David Harel. 2025. Accelerating LLM Inference with Lossless Speculative Decoding Algorithms for Heterogeneous Vocabularies. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711875.3729132"},{"key":"e_1_3_2_1_73_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS).","author":"Wang Junyang","year":"2024","unstructured":"Junyang Wang, Haiyang Xu, Haitao Jia, Xi Zhang, Ming Yan, Weizhou Shen, Ji Zhang, Fei Huang, and Jitao Sang. 2024. Mobile-agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-agent Collaboration. In Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_74_1","volume-title":"Mobile-agent: Autonomous Multi-modal Mobile Device Agent with Visual Perception. In arxiv.org.","author":"Wang Junyang","year":"2024","unstructured":"Junyang Wang, Haiyang Xu, Jiabo Ye, Ming Yan, Weizhou Shen, Ji Zhang, Fei Huang, and Jitao Sang. 2024. Mobile-agent: Autonomous Multi-modal Mobile Device Agent with Visual Perception. In arxiv.org."},{"key":"e_1_3_2_1_75_1","unstructured":"WizardLM Team. 2024. WizardLM 2. https:\/\/wizardlm.github.io\/WizardLM2"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731101"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696098"},{"key":"e_1_3_2_1_78_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR).","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao. 2023. React: Synergizing Reasoning and Acting in Language Models. In Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_79_1","volume-title":"EdgeMoE: Empowering Sparse Large Language Models on Mobile Devices","author":"Yi Rongjie","year":"2025","unstructured":"Rongjie Yi, Liwei Guo, Shiyun Wei, Ao Zhou, Shangguang Wang, and Mengwei Xu. 2025. EdgeMoE: Empowering Sparse Large Language Models on Mobile Devices. IEEE Transactions on Mobile Computing (2025)."},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00108"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713600"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.607"},{"key":"e_1_3_2_1_83_1","volume-title":"FDC: Fast KV Dimensionality Compression for Efficient LLM Inference. In arxiv.org.","author":"Zhang Zeyu","year":"2024","unstructured":"Zeyu Zhang and Haiying Shen. 2024. FDC: Fast KV Dimensionality Compression for Efficient LLM Inference. In arxiv.org."},{"key":"e_1_3_2_1_84_1","volume-title":"Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_85_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Zhou Andy","year":"2024","unstructured":"Andy Zhou, Kai Yan, Michal Shlapentokh-Rothman, Haohan Wang, and Yu-Xiong Wang. 2024. Language Agent Tree Search Unifies Reasoning, Acting, and Planning in Language Models. In Proceedings of the International Conference on Machine Learning (ICML)."}],"event":{"name":"MobiSys '26: 24th Annual International Conference on Mobile Systems, Applications and Services","location":"University of Cambridge Cambridge United Kingdom","acronym":"MobiSys '26","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 24th Annual International Conference on Mobile Systems, Applications and Services"],"original-title":[],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:56:35Z","timestamp":1780059395000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3745756.3809195"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,20]]},"references-count":85,"alternative-id":["10.1145\/3745756.3809195","10.1145\/3745756"],"URL":"https:\/\/doi.org\/10.1145\/3745756.3809195","relation":{},"subject":[],"published":{"date-parts":[[2026,6,20]]},"assertion":[{"value":"2026-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}