{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T07:54:21Z","timestamp":1780473261693,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","funder":[{"name":"National Key R&D Program of China","award":["2024YFE0200802"],"award-info":[{"award-number":["2024YFE0200802"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62293511"],"award-info":[{"award-number":["62293511"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["623B1002"],"award-info":[{"award-number":["623B1002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62293481"],"award-info":[{"award-number":["62293481"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62201505"],"award-info":[{"award-number":["62201505"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["92467301"],"award-info":[{"award-number":["92467301"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Key Research and Development Program of Zhejiang Province","award":["2025C01012"],"award-info":[{"award-number":["2025C01012"]}]},{"name":"SUTD-ZJU IDEA Grant SUTD-ZJU (VP)","award":["202102"],"award-info":[{"award-number":["202102"]}]},{"name":"ZJUCSE-Enflame Cloud and Edge Intelligence Joint Laboratory"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,3]]},"DOI":"10.1145\/3680207.3723487","type":"proceedings-article","created":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T13:19:18Z","timestamp":1763731158000},"page":"483-497","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Confidant: Customizing Transformer-based LLMs via Collaborative Training on Mobile Devices"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9905-5823","authenticated-orcid":false,"given":"Yuhao","family":"Chen","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4789-1645","authenticated-orcid":false,"given":"Yuxuan","family":"Yan","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6110-7645","authenticated-orcid":false,"given":"Shuowei","family":"Ge","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4533-6467","authenticated-orcid":false,"given":"Yuyang","family":"Qin","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0706-623X","authenticated-orcid":false,"given":"Yue","family":"Zheng","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4747-9410","authenticated-orcid":false,"given":"Qianqian","family":"Yang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1505-6766","authenticated-orcid":false,"given":"Shibo","family":"He","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9160-048X","authenticated-orcid":false,"given":"Zhiguo","family":"Shi","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3155-3145","authenticated-orcid":false,"given":"Jiming","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"},{"name":"Hangzhou Dianzi University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9542-7095","authenticated-orcid":false,"given":"Yuanchao","family":"Shu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,11,21]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Marah Abdin Jyoti Aneja Harkirat Behl S\u00e9bastien Bubeck Ronen Eldan Suriya Gunasekar Michael Harrison Russell J Hewett Mojan Javaheripi Piero Kauffmann et al. 2024. Phi-4 technical report. arXiv preprint arXiv:2412.08905 (2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"AI@Meta. 2024. Llama 3 Model Card. (2024). https:\/\/github.com\/meta-llama\/llama3\/blob\/main\/MODEL_CARD.md"},{"key":"e_1_3_2_1_3_1","volume-title":"Mohammad Rastegari, and Mehrdad Farajtabar.","author":"Alizadeh Keivan","year":"2024","unstructured":"Keivan Alizadeh, Iman Mirzadeh, Dmitry Belenko, Karen Khatamifard, Minsik Cho, Carlo C Del Mundo, Mohammad Rastegari, and Mehrdad Farajtabar. 2024. LLM in a flash: Efficient Large Language Model Inference with Limited Memory. arXiv preprint arXiv:2312.11514 (2024)."},{"key":"e_1_3_2_1_4_1","unstructured":"Android Studio. 2023. Profile battery usage with Batterystats and Battery Historian. https:\/\/developer.android.google.cn\/topic\/performance\/power\/setup-battery-historian.html. (2023). Accessed on November 19 2023."},{"key":"e_1_3_2_1_5_1","unstructured":"Angels Balaguer Vinamra Benara Renato Luiz de Freitas Cunha Roberto de M. Estev\u00e3o Filho Todd Hendry Daniel Holstein Jennifer Marsman Nick Mecklenburg Sara Malvar Leonardo O. Nunes Rafael Padilha Morris Sharp Bruno Silva Swati Sharma Vijay Aski and Ranveer Chandra. 2024. RAG vs Fine-tuning: Pipelines Tradeoffs and a Case Study on Agriculture. arXiv preprint arXiv:2401.08406 (2024)."},{"key":"e_1_3_2_1_6_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. NeurIPS (2020)."},{"key":"e_1_3_2_1_7_1","volume-title":"EEFL: High-speed wireless communications inspired energy efficient federated learning over mobile devices. ACM MobiSys","author":"Chen Rui","year":"2023","unstructured":"Rui Chen, Qiyu Wan, Xinyue Zhang, Xiaoqi Qin, Yanzhao Hou, Di Wang, Xin Fu, and Miao Pan. 2023. EEFL: High-speed wireless communications inspired energy efficient federated learning over mobile devices. ACM MobiSys (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174","author":"Chen Tianqi","year":"2016","unstructured":"Tianqi Chen, Bing Xu, Chiyuan Zhang, and Carlos Guestrin. 2016. Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174 (2016)."},{"key":"e_1_3_2_1_9_1","volume-title":"AccEPT: An Acceleration Scheme for Speeding Up Edge Pipeline-parallel Training","author":"Chen Yuhao","year":"2024","unstructured":"Yuhao Chen, Yuxuan Yan, Qianqian Yang, Yuanchao Shu, Shibo He, Zhiguo Shi, and Jiming Chen. 2024. AccEPT: An Acceleration Scheme for Speeding Up Edge Pipeline-parallel Training. IEEE Transactions on Mobile Computing (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"FTPipeHD: A Fault-Tolerant Pipeline-Parallel Distributed Training Approach for Heterogeneous Edge Devices","author":"Chen Yuhao","year":"2023","unstructured":"Yuhao Chen, Qianqian Yang, Shibo He, Zhiguo Shi, Jiming Chen, and Mohsen Guizani. 2023. FTPipeHD: A Fault-Tolerant Pipeline-Parallel Distributed Training Approach for Heterogeneous Edge Devices. IEEE Transactions on Mobile Computing (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Performance-preserving {DNN} Training Framework for Saving Energy. USENIX ATC","author":"Choi Sangjin","year":"2023","unstructured":"Sangjin Choi, Inhoe Koo, Jeongseob Ahn, Myeongjae Jeon, and Youngjin Kwon. 2023. {EnvPipe}: Performance-preserving {DNN} Training Framework for Saving Energy. USENIX ATC (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"Qlora: Efficient finetuning of quantized llms. NeurIPS","author":"Dettmers Tim","year":"2024","unstructured":"Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. 2024. Qlora: Efficient finetuning of quantized llms. NeurIPS (2024)."},{"key":"e_1_3_2_1_13_1","unstructured":"ONNX Runtime developers. 2021. ONNX Runtime. https:\/\/onnxruntime.ai\/. (2021)."},{"key":"e_1_3_2_1_14_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_15_1","volume-title":"Reducing transformer depth on demand with structured dropout. ICLR","author":"Fan Angela","year":"2020","unstructured":"Angela Fan, Edouard Grave, and Armand Joulin. 2020. Reducing transformer depth on demand with structured dropout. ICLR (2020)."},{"key":"e_1_3_2_1_16_1","unstructured":"Forbes. 2023. What Large Models Cost You - There Is No Free AI Lunch. https:\/\/www.forbes.com\/sites\/craigsmith\/2023\/09\/08\/what-large-models-cost-you-there-is-no-free-ai-lunch. (2023). Accessed on March 11 2024."},{"key":"e_1_3_2_1_17_1","volume-title":"Memory-efficient DNN training on mobile devices. ACM MobiSys","author":"Gim In","year":"2022","unstructured":"In Gim and JeongGil Ko. 2022. Memory-efficient DNN training on mobile devices. ACM MobiSys (2022)."},{"key":"e_1_3_2_1_18_1","unstructured":"gRPC developers. 2024. gRPC. https:\/\/grpc.io. (2024)."},{"key":"e_1_3_2_1_19_1","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et al. 2025. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)."},{"key":"e_1_3_2_1_20_1","volume-title":"Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly.","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. ICML (2019)."},{"key":"e_1_3_2_1_21_1","volume-title":"Lora: Low-rank adaptation of large language models. ICLR","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. Lora: Low-rank adaptation of large language models. ICLR (2022)."},{"key":"e_1_3_2_1_22_1","volume-title":"Codl: efficient cpu-gpu co-execution for deep learning inference on mobile devices. ACM MobiSys","author":"Jia Fucheng","year":"2022","unstructured":"Fucheng Jia, Deyu Zhang, Ting Cao, Shiqi Jiang, Yunxin Liu, Ju Ren, and Yaoxue Zhang. 2022. Codl: efficient cpu-gpu co-execution for deep learning inference on mobile devices. ACM MobiSys (2022)."},{"key":"e_1_3_2_1_23_1","volume-title":"Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al.","author":"Jiang Albert Q","year":"2024","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al. 2024. Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Profiling and optimizing deep learning inference on mobile GPUs. ACM APSys","author":"Jiang Shiqi","year":"2020","unstructured":"Shiqi Jiang, Lihao Ran, Ting Cao, Yusen Xu, and Yunxin Liu. 2020. Profiling and optimizing deep learning inference on mobile GPUs. ACM APSys (2020)."},{"key":"e_1_3_2_1_25_1","volume-title":"MNN: A Universal and Efficient Inference Engine. MLSys","author":"Jiang Xiaotang","year":"2020","unstructured":"Xiaotang Jiang, Huan Wang, Yiliu Chen, Ziqi Wu, Lichuan Wang, Bin Zou, Yafeng Yang, Zongyang Cui, Yu Cai, Tianhang Yu, Chengfei Lv, and Zhihua Wu. 2020. MNN: A Universal and Efficient Inference Engine. MLSys (2020)."},{"key":"e_1_3_2_1_26_1","volume-title":"A review of on-device fully neural end-to-end automatic speech recognition algorithms","author":"Kim Chanwoo","year":"2020","unstructured":"Chanwoo Kim, Dhananjaya Gowda, Dongsoo Lee, Jiyeon Kim, Ankur Kumar, Sungsoo Kim, Abhinav Garg, and Changwoo Han. 2020. A review of on-device fully neural end-to-end automatic speech recognition algorithms. IEEE ACSSC (2020)."},{"key":"e_1_3_2_1_27_1","volume-title":"\u03bclayer: Low latency on-device inference using cooperative single-layer acceleration and processor-friendly quantization. ACM EuroSys","author":"Kim Youngsok","year":"2019","unstructured":"Youngsok Kim, Joonsung Kim, Dongju Chae, Daehyun Kim, and Jangwoo Kim. 2019. \u03bclayer: Low latency on-device inference using cooperative single-layer acceleration and processor-friendly quantization. ACM EuroSys (2019)."},{"key":"e_1_3_2_1_28_1","volume-title":"Ananda Theertha Suresh, and Dave Bacon","author":"Kone\u010dn\u1ef3 Jakub","year":"2016","unstructured":"Jakub Kone\u010dn\u1ef3, H Brendan McMahan, Felix X Yu, Peter Richt\u00e1rik, Ananda Theertha Suresh, and Dave Bacon. 2016. Federated Learning: Strategies for Improving Communication Efficiency. arXiv preprint arXiv:1610.05492 (2016)."},{"key":"e_1_3_2_1_29_1","volume-title":"Reducing activation recomputation in large transformer models. MLSys","author":"Korthikanti Vijay Anand","year":"2023","unstructured":"Vijay Anand Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. 2023. Reducing activation recomputation in large transformer models. MLSys (2023)."},{"key":"e_1_3_2_1_30_1","unstructured":"Patrick Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. NeurIPS (2020)."},{"key":"e_1_3_2_1_31_1","volume-title":"Respipe: Resilient model-distributed dnn training at edge networks","author":"Li Pengzhen","year":"2021","unstructured":"Pengzhen Li, Erdem Koyuncu, and Hulya Seferoglu. 2021. Respipe: Resilient model-distributed dnn training at edge networks. IEEE ICASSP (2021)."},{"key":"e_1_3_2_1_32_1","volume-title":"Terapipe: Token-level pipeline parallelism for training large-scale language models. PMLR.","author":"Li Zhuohan","year":"2021","unstructured":"Zhuohan Li, Siyuan Zhuang, Shiyuan Guo, Danyang Zhuo, Hao Zhang, Dawn Song, and Ion Stoica. 2021. Terapipe: Token-level pipeline parallelism for training large-scale language models. PMLR."},{"key":"e_1_3_2_1_33_1","unstructured":"Zichang Liu Jue Wang Tri Dao Tianyi Zhou Binhang Yuan Zhao Song Anshumali Shrivastava Ce Zhang Yuandong Tian Christopher Re et al. 2023. Deja vu: Contextual sparsity for efficient llms at inference time. ICML (2023)."},{"key":"e_1_3_2_1_34_1","unstructured":"llama.cpp team. 2023\u20132025. llama.cpp. (2023\u20132025). https:\/\/github.com\/ggerganov\/llama.cpp"},{"key":"e_1_3_2_1_35_1","unstructured":"Meta Platforms Inc. 2024. Building Meta's GenAI Infrastructure. https:\/\/engineering.fb.com\/2024\/03\/12\/data-center-engineering\/building-metas-genai-infrastructure\/. (2024)."},{"key":"e_1_3_2_1_36_1","unstructured":"Microsoft. 2024. Compare gRPC services with HTTP APIs. https:\/\/learn.microsoft.com\/en-us\/aspnet\/core\/grpc\/comparison?view=aspnetcore-8.0. (2024)."},{"key":"e_1_3_2_1_37_1","unstructured":"MLC team. 2023\u20132025. MLC-LLM. (2023\u20132025). https:\/\/github.com\/mlc-ai\/mlc-llm"},{"key":"e_1_3_2_1_38_1","volume-title":"PipeDream: Generalized pipeline parallelism for DNN training. SOSP","author":"Narayanan Deepak","year":"2019","unstructured":"Deepak Narayanan, Aaron Harlap, Amar Phanishayee, Vivek Seshadri, Nikhil R Devanur, Gregory R Ganger, Phillip B Gibbons, and Matei Zaharia. 2019. PipeDream: Generalized pipeline parallelism for DNN training. SOSP (2019)."},{"key":"e_1_3_2_1_39_1","volume-title":"Fitzek","author":"P\u00e9ter Vingelmann NVIDIA","year":"2020","unstructured":"NVIDIA, P\u00e9ter Vingelmann, and Frank H.P. Fitzek. 2020. CUDA, release: 10.2.89. (2020). https:\/\/developer.nvidia.com\/cuda-toolkit"},{"key":"e_1_3_2_1_40_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"PipeFisher: Efficient Training of Large Language Models Using Pipelining and Fisher Information Matrices. MLSys","author":"Osawa Kazuki","year":"2023","unstructured":"Kazuki Osawa, Shigang Li, and Torsten Hoefler. 2023. PipeFisher: Efficient Training of Large Language Models Using Pipelining and Fisher Information Matrices. MLSys (2023)."},{"key":"e_1_3_2_1_42_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAIblog (2019)."},{"key":"e_1_3_2_1_43_1","volume-title":"Know what you don't know: Unanswerable questions for SQuAD. ACL","author":"Rajpurkar Pranav","year":"2018","unstructured":"Pranav Rajpurkar, Robin Jia, and Percy Liang. 2018. Know what you don't know: Unanswerable questions for SQuAD. ACL (2018)."},{"key":"e_1_3_2_1_44_1","volume-title":"A Fine-tuning Enhanced RAG System with Quantized Influence Measure as AI Judge. arXiv preprint arXiv:2402.17081","author":"Rangan Keshav","year":"2024","unstructured":"Keshav Rangan and Yiqiao Yin. 2024. A Fine-tuning Enhanced RAG System with Quantized Influence Measure as AI Judge. arXiv preprint arXiv:2402.17081 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"Introduction to the CoNLL-2003 shared task: Language-independent named entity recognition. HLT-NAACL","author":"Sang Erik F","year":"2003","unstructured":"Erik F Sang and Fien De Meulder. 2003. Introduction to the CoNLL-2003 shared task: Language-independent named entity recognition. HLT-NAACL (2003)."},{"key":"e_1_3_2_1_46_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)."},{"key":"e_1_3_2_1_47_1","volume-title":"Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_48_1","volume-title":"Improving the Domain Adaptation of Retrieval Augmented Generation (RAG) Models for Open Domain Question Answering. Transactions of the Association for Computational Linguistics","author":"Siriwardhana Shamane","year":"2023","unstructured":"Shamane Siriwardhana, Rivindu Weerasekera, Elliott Wen, Tharindu Kaluarachchi, Rajib Rana, and Suranga Nanayakkara. 2023. Improving the Domain Adaptation of Retrieval Augmented Generation (RAG) Models for Open Domain Question Answering. Transactions of the Association for Computational Linguistics (2023)."},{"key":"e_1_3_2_1_49_1","volume-title":"Mobilebert: a compact task-agnostic bert for resource-limited devices. ACL","author":"Sun Zhiqing","year":"2020","unstructured":"Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. 2020. Mobilebert: a compact task-agnostic bert for resource-limited devices. ACL (2020)."},{"key":"e_1_3_2_1_50_1","volume-title":"Hashimoto","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. 2023. Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca. (2023)."},{"key":"e_1_3_2_1_51_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"Federated evaluation of on-device personalization. arXiv preprint arXiv:1910.10252","author":"Wang Kangkang","year":"2019","unstructured":"Kangkang Wang, Rajiv Mathews, Chlo\u00e9 Kiddon, Hubert Eichner, Fran\u00e7oise Beaufays, and Daniel Ramage. 2019. Federated evaluation of on-device personalization. arXiv preprint arXiv:1910.10252 (2019)."},{"key":"e_1_3_2_1_53_1","volume-title":"Melon: Breaking the memory wall for resource-efficient on-device machine learning. ACM MobiSys","author":"Wang Qipeng","year":"2022","unstructured":"Qipeng Wang, Mengwei Xu, Chao Jin, Xinran Dong, Jinliang Yuan, Xin Jin, Gang Huang, Yunxin Liu, and Xuanzhe Liu. 2022. Melon: Breaking the memory wall for resource-efficient on-device machine learning. ACM MobiSys (2022)."},{"key":"e_1_3_2_1_54_1","volume-title":"OPTiC: Optimizing collaborative CPU-GPU computing on mobile devices with thermal constraints","author":"Wang Siqi","year":"2018","unstructured":"Siqi Wang, Gayathri Ananthanarayanan, and Tulika Mitra. 2018. OPTiC: Optimizing collaborative CPU-GPU computing on mobile devices with thermal constraints. IEEE transactions on computer-aided design of integrated circuits and systems (2018)."},{"key":"e_1_3_2_1_55_1","volume-title":"NN-Stretch: Automatic Neural Network Branching for Parallel Inference on Heterogeneous Multi-Processors. ACM MobiSys","author":"Wei Jianyu","year":"2023","unstructured":"Jianyu Wei, Ting Cao, Shijie Cao, Shiqi Jiang, Shaowei Fu, Mao Yang, Yanyong Zhang, and Yunxin Liu. 2023. NN-Stretch: Automatic Neural Network Branching for Parallel Inference on Heterogeneous Multi-Processors. ACM MobiSys (2023)."},{"key":"e_1_3_2_1_56_1","volume-title":"Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu.","author":"Wen Hao","year":"2024","unstructured":"Hao Wen, Yuanchun Li, Guohong Liu, Shanhui Zhao, Tao Yu, Toby Jia-Jun Li, Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu. 2024. Autodroid: Llm-powered task automation in android. ACM MobiCom (2024)."},{"key":"e_1_3_2_1_57_1","volume-title":"The Free Encyclopedia. https:\/\/en.wikipedia.org\/w\/index.php?title=Java_Native_Interface&oldid=1166267665.","author":"Wikipedia","year":"2023","unstructured":"Wikipedia contributors. 2023. Java Native Interface \u2014 Wikipedia, The Free Encyclopedia. https:\/\/en.wikipedia.org\/w\/index.php?title=Java_Native_Interface&oldid=1166267665. (2023). [Online; accessed 3-October-2023]."},{"key":"e_1_3_2_1_58_1","volume-title":"Smoothquant: Accurate and efficient post-training quantization for large language models. ICML","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. ICML (2023)."},{"key":"e_1_3_2_1_59_1","volume-title":"FwdLLM: Efficient Federated Finetuning of Large Language Models with Perturbed Inferences. USENIX ATC","author":"Xu Mengwei","year":"2024","unstructured":"Mengwei Xu, Dongqi Cai, Yaozong Wu, Xiang Li, and Shangguang Wang. 2024. FwdLLM: Efficient Federated Finetuning of Large Language Models with Perturbed Inferences. USENIX ATC (2024)."},{"key":"e_1_3_2_1_60_1","volume-title":"Asteroid: Resource-Efficient Hybrid Pipeline Parallelism for Collaborative DNN Training on Heterogeneous Edge Devices. ACM MobiCom","author":"Ye Shengyuan","year":"2024","unstructured":"Shengyuan Ye, Liekang Zeng, Xiaowen Chu, Guoliang Xing, and Xu Chen. 2024. Asteroid: Resource-Efficient Hybrid Pipeline Parallelism for Collaborative DNN Training on Heterogeneous Edge Devices. ACM MobiCom (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"Edgemoe: Fast on-device inference of moe-based large language models. arXiv preprint arXiv:2308.14352","author":"Yi Rongjie","year":"2023","unstructured":"Rongjie Yi, Liwei Guo, Shiyun Wei, Ao Zhou, Shangguang Wang, and Mengwei Xu. 2023. Edgemoe: Fast on-device inference of moe-based large language models. arXiv preprint arXiv:2308.14352 (2023)."},{"key":"e_1_3_2_1_62_1","volume-title":"Nn-meter: Towards accurate latency prediction of deep-learning model inference on diverse edge devices. ACM MobiSys","author":"Zhang Li Lyna","year":"2021","unstructured":"Li Lyna Zhang, Shihao Han, Jianyu Wei, Ningxin Zheng, Ting Cao, Yuqing Yang, and Yunxin Liu. 2021. Nn-meter: Towards accurate latency prediction of deep-learning model inference on diverse edge devices. ACM MobiSys (2021)."},{"key":"e_1_3_2_1_63_1","volume-title":"A review on edge large language models: Design, execution, and applications. Comput. Surveys","author":"Zheng Yue","year":"2024","unstructured":"Yue Zheng, Yuhao Chen, Bin Qian, Xiufang Shi, Yuanchao Shu, and Jiming Chen. 2024. A review on edge large language models: Design, execution, and applications. Comput. Surveys (2024)."},{"key":"e_1_3_2_1_64_1","volume-title":"LLaVA-\u03a6: Efficient Multi-Modal Assistant with Small Language Model. arXiv preprint arXiv:2401.02330","author":"Zhu Yichen","year":"2024","unstructured":"Yichen Zhu, Minjie Zhu, Ning Liu, Zhicai Ou, Xiaofeng Mou, and Jian Tang. 2024. LLaVA-\u03a6: Efficient Multi-Modal Assistant with Small Language Model. arXiv preprint arXiv:2401.02330 (2024)."}],"event":{"name":"ACM MOBICOM '25: 31st Annual International Conference on Mobile Computing and Networking","location":"Kerry Hotel, Hong Kong Hong Kong China","acronym":"ACM MOBICOM '25","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing"]},"container-title":["Proceedings of the 31st Annual International Conference on Mobile Computing and Networking"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3680207.3723487","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T13:25:18Z","timestamp":1763731518000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680207.3723487"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,3]]},"references-count":64,"alternative-id":["10.1145\/3680207.3723487","10.1145\/3680207"],"URL":"https:\/\/doi.org\/10.1145\/3680207.3723487","relation":{},"subject":[],"published":{"date-parts":[[2025,11,3]]},"assertion":[{"value":"2025-11-21","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}