{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T14:01:43Z","timestamp":1780063303903,"version":"3.54.0"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T00:00:00Z","timestamp":1781913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/legalcode"}],"funder":[{"name":"Science and Technology Development Fund of Macau","award":["0107\/2024\/RIA2"],"award-info":[{"award-number":["0107\/2024\/RIA2"]}]},{"name":"Science and Technology Development Fund of Macau","award":["0061\/2025\/RIB2"],"award-info":[{"award-number":["0061\/2025\/RIB2"]}]},{"name":"Joint Science and Technology Research Project with Hong Kong and Macau in Key Areas of Nansha District's Science and Technology Plan","award":["EF2024-00180-IOTSC"],"award-info":[{"award-number":["EF2024-00180-IOTSC"]}]},{"name":"Multi-Year Research Grant of University of Macau","award":["MYRG-GRG2023-00211-IOTSC-UMDF"],"award-info":[{"award-number":["MYRG-GRG2023-00211-IOTSC-UMDF"]}]},{"name":"Multi-Year Research Grant of University of Macau","award":["MYRG-GRG2024-00180-IOTSC"],"award-info":[{"award-number":["MYRG-GRG2024-00180-IOTSC"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,21]]},"DOI":"10.1145\/3745756.3809214","type":"proceedings-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:52:21Z","timestamp":1780059141000},"page":"432-447","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["FBLayout: Optimizing Memory Layout for Efficient LLM Finetuning on Mobile GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5816-6837","authenticated-orcid":false,"given":"Kahou","family":"Tam","sequence":"first","affiliation":[{"name":"State Key Laboratory of IoTSC, University of Macau, Macau, Macao"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2697-7042","authenticated-orcid":false,"given":"Wei","family":"Niu","sequence":"additional","affiliation":[{"name":"University of Georgia, Athens, GA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3744-0580","authenticated-orcid":false,"given":"Yu","family":"Bao","sequence":"additional","affiliation":[{"name":"State Key Laboratory of IoTSC, University of Macau, Macau, Macao"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0710-0963","authenticated-orcid":false,"given":"Xiaomin","family":"Ouyang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9480-0356","authenticated-orcid":false,"given":"ChengZhong","family":"Xu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of IoTSC, University of Macau, Macau, Macao"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2044-8289","authenticated-orcid":false,"given":"Li","family":"Li","sequence":"additional","affiliation":[{"name":"State Key Laboratory of IoTSC, University of Macau, Macau, Macao"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Adreno. [n. d.]. Adreno wikipedia. https:\/\/en.wikipedia.org\/wiki\/Adreno."},{"key":"e_1_3_2_1_2_1","volume-title":"Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai.","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel De Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 2023. Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245 (2023)."},{"key":"e_1_3_2_1_3_1","first-page":"03","article-title":"HInside NVIDIA Blackwell Ultra: The Chip Powering the AI Factory Era. https:\/\/developer.nvidia.com\/blog\/inside-nvidia-blackwell-ultra-the-chip-powering-the-ai-factory-era\/","volume":"2026","author":"Aubrey Kyle","year":"2025","unstructured":"Kyle Aubrey and Nick Stam. 2025. HInside NVIDIA Blackwell Ultra: The Chip Powering the AI Factory Era. https:\/\/developer.nvidia.com\/blog\/inside-nvidia-blackwell-ultra-the-chip-powering-the-ai-factory-era\/. Accessed: 2026.03.","journal-title":"Accessed"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision. 357\u2013366","author":"Richard Chen Chun-Fu","year":"2021","unstructured":"Chun-Fu Richard Chen, Quanfu Fan, and Rameswar Panda. 2021. Crossvit: Cross-attention multi-scale vision transformer for image classification. In Proceedings of the IEEE\/CVF international conference on computer vision. 357\u2013366."},{"key":"e_1_3_2_1_5_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. 2018. {TVM}: An automated {End-to-End} optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578\u2013594."},{"key":"e_1_3_2_1_6_1","volume-title":"Vision transformer adapter for dense predictions. arXiv preprint arXiv:2205.08534","author":"Chen Zhe","year":"2022","unstructured":"Zhe Chen, Yuchen Duan, Wenhai Wang, Junjun He, Tong Lu, Jifeng Dai, and Yu Qiao. 2022. Vision transformer adapter for dense predictions. arXiv preprint arXiv:2205.08534 (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. 2023. Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"8-bit optimizers via block-wise quantization. arXiv preprint arXiv:2110.02861","author":"Dettmers Tim","year":"2021","unstructured":"Tim Dettmers, Mike Lewis, Sam Shleifer, and Luke Zettlemoyer. 2021. 8-bit optimizers via block-wise quantization. arXiv preprint arXiv:2110.02861 (2021)."},{"key":"e_1_3_2_1_9_1","volume-title":"Qlora: Efficient finetuning of quantized llms. Advances in neural information processing systems 36","author":"Dettmers Tim","year":"2023","unstructured":"Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. 2023. Qlora: Efficient finetuning of quantized llms. Advances in neural information processing systems 36 (2023), 10088\u201310115."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171\u20134186."},{"key":"e_1_3_2_1_11_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16\u00d716 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_12_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv e-prints (2024) arXiv-2407."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3498361.3539765"},{"key":"e_1_3_2_1_14_1","volume-title":"https:\/\/www.tensorflow.org\/lite. Accessed","author":"Lite TensorFlow","year":"2025","unstructured":"Google. 2025. TensorFlow Lite. https:\/\/www.tensorflow.org\/lite. Accessed: 2025.9."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 39th ACM International Conference on Supercomputing. 205\u2013220","author":"Guan Jiexiong","year":"2025","unstructured":"Jiexiong Guan, Zhenqing Hu, Christos D Antonopoulos, Nikolaos Bellas, Spyros Lalis, Evgenia Smirni, Gang Zhou, Gagan Agrawal, and Bin Ren. 2025. TM-Model: Modeling Texture Memory and Mobile GPU Performance to Accelerate DNN Computations. In Proceedings of the 39th ACM International Conference on Supercomputing. 205\u2013220."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 24th annual international symposium on Computer architecture. 108\u2013120","author":"Hakura Ziyad S","year":"1997","unstructured":"Ziyad S Hakura and Anoop Gupta. 1997. The design and analysis of a cache architecture for texture mapping. In Proceedings of the 24th annual international symposium on Computer architecture. 108\u2013120."},{"key":"e_1_3_2_1_17_1","volume-title":"Scaling llm test-time compute with mobile npu on smartphones. arXiv preprint arXiv:2509.23324","author":"Hao Zixu","year":"2025","unstructured":"Zixu Hao, Jianyu Wei, Tuowei Wang, Minxing Huang, Huiqiang Jiang, Shiqi Jiang, Ting Cao, and Ju Ren. 2025. Scaling llm test-time compute with mobile npu on smartphones. arXiv preprint arXiv:2509.23324 (2025)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_19_1","volume-title":"International conference on machine learning. PMLR, 2790\u20132799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. In International conference on machine learning. PMLR, 2790\u20132799."},{"key":"e_1_3_2_1_20_1","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al. 2022. Lora: Low-rank adaptation of large language models. ICLR 1, 2 (2022), 3.","journal-title":"ICLR"},{"key":"e_1_3_2_1_21_1","volume-title":"Test-time learning for large language models. arXiv preprint arXiv:2505.20633","author":"Hu Jinwu","year":"2025","unstructured":"Jinwu Hu, Zhitian Zhang, Guohao Chen, Xutao Wen, Chao Shuai, Wei Luo, Bin Xiao, Yuanqing Li, and Mingkui Tan. 2025. Test-time learning for large language models. arXiv preprint arXiv:2505.20633 (2025)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581791.3596852"},{"key":"e_1_3_2_1_23_1","first-page":"1","article-title":"MNN: A universal and efficient inference engine","volume":"2","author":"Jiang Xiaotang","year":"2020","unstructured":"Xiaotang Jiang, Huan Wang, Yiliu Chen, Ziqi Wu, Lichuan Wang, Bin Zou, Yafeng Yang, Zongyang Cui, Yu Cai, Tianhang Yu, et al. 2020. MNN: A universal and efficient inference engine. Proceedings of Machine Learning and Systems 2 (2020), 1\u201313.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01343"},{"key":"e_1_3_2_1_25_1","volume-title":"MobiLLM: Enabling LLM Fine-Tuning on the Mobile Device via Server Assisted Side Tuning. arXiv preprint arXiv:2502.20421","author":"Li Liang","year":"2025","unstructured":"Liang Li, Xingke Yang, Wen Wu, Hao Wang, Tomoaki Ohtsuki, Xin Fu, Miao Pan, and Xuemin Shen. 2025. MobiLLM: Enabling LLM Fine-Tuning on the Mobile Device via Server Assisted Side Tuning. arXiv preprint arXiv:2502.20421 (2025)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3662006.3662059"},{"key":"e_1_3_2_1_27_1","volume-title":"Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190","author":"Li Xiang Lisa","year":"2021","unstructured":"Xiang Lisa Li and Percy Liang. 2021. Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190 (2021)."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 28th Annual International Conference on Mobile Computing And Networking. 487\u2013500","author":"Liang Rendong","year":"2022","unstructured":"Rendong Liang, Ting Cao, Jicheng Wen, Manni Wang, Yang Wang, Jianhua Zou, and Yunxin Liu. 2022. Romou: Rapidly generate high-performance tensor kernels for mobile gpus. In Proceedings of the 28th Annual International Conference on Mobile Computing And Networking. 487\u2013500."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3678004"},{"key":"e_1_3_2_1_30_1","volume-title":"DAF: An Efficient End-to-End Dynamic Activation Framework for on-Device DNN Training. arXiv preprint arXiv:2507.07149","author":"Liu Renyuan","year":"2025","unstructured":"Renyuan Liu, Yuyang Leng, Kaiyan Liu, Shaohan Hu, Peijun Zhao, Heechul Yun, Shuochao Yao, et al. 2025. DAF: An Efficient End-to-End Dynamic Activation Framework for on-Device DNN Training. arXiv preprint arXiv:2507.07149 (2025)."},{"key":"e_1_3_2_1_31_1","unstructured":"LLVM. [n. d.]. LoopStrengthReduce. https:\/\/llvm.org\/doxygen\/LoopStrengthReduce_8cpp_source.html."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/HCS59251.2023.10254715"},{"key":"e_1_3_2_1_33_1","unstructured":"MNN. [n. d.]. MNN GitHub Repository: BFC Allocator. https:\/\/github.com\/alibaba\/MNN\/blob\/2.3.0\/source\/core\/BufferAllocator.cpp."},{"key":"e_1_3_2_1_34_1","first-page":"03","article-title":"High voltage power moniter. https:\/\/www.msoon.com\/","volume":"2026","author":"Solutions Inc Monsoon","year":"2023","unstructured":"Inc Monsoon Solutions. 2023. High voltage power moniter. https:\/\/www.msoon.com\/. Accessed: 2026.03.","journal-title":"Accessed"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651384"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606763"},{"key":"e_1_3_2_1_38_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_39_1","unstructured":"Qualcomm. [n. d.]. Snapdragon profiler. https:\/\/developer.qualcomm.com\/software\/snapdragon-profiler."},{"key":"e_1_3_2_1_40_1","volume-title":"International conference on machine learning. PMLR, 28492\u201328518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. PMLR, 28492\u201328518."},{"key":"e_1_3_2_1_41_1","unstructured":"Josh Render. [n. d.]. Android phones are now selling with a crazy 24GB of RAM \u2014 buy or skip? https:\/\/www.tomsguide.com\/phones\/android-phones\/android-phones-are-now-selling-with-a-crazy-24gb-of-ram-buy-or-skip. Accessed: 2025.9."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"e_1_3_2_1_44_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_45_1","volume-title":"Federated noisy client learning","author":"Tam Kahou","year":"2023","unstructured":"Kahou Tam, Li Li, Bo Han, Chengzhong Xu, and Huazhu Fu. 2023. Federated noisy client learning. IEEE transactions on neural networks and learning systems 36, 1 (2023), 1799\u20131812."},{"key":"e_1_3_2_1_46_1","volume-title":"ECAI","author":"Tam Kahou","year":"2023","unstructured":"Kahou Tam, Li Li, Yan Zhao, and Chengzhong Xu. 2023. Fedcoop: Cooperative federated learning for noisy labels. In ECAI 2023. IOS Press, 2298\u20132306."},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the 22nd ACM Conference on Embedded Networked Sensor Systems. 394\u2013408","author":"Tam Kahou","year":"2024","unstructured":"Kahou Tam, Chunlin Tian, Li Li, Haikai Zhao, and ChengZhong Xu. 2024. Fed-Hybrid: Breaking the Memory Wall of Federated Learning via Hybrid Tensor Management. In Proceedings of the 22nd ACM Conference on Embedded Networked Sensor Systems. 394\u2013408."},{"key":"e_1_3_2_1_48_1","volume-title":"Cassidy Hardin, Surya Bhupatiraju, L\u00e9onard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ram\u00e9, et al.","author":"Team Gemma","year":"2024","unstructured":"Gemma Team, Morgane Riviere, Shreya Pathak, Pier Giuseppe Sessa, Cassidy Hardin, Surya Bhupatiraju, L\u00e9onard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ram\u00e9, et al. 2024. Gemma 2: Improving open language models at a practical size. arXiv preprint arXiv:2408.00118 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"2022 55th IEEE\/ACM International Symposium on Microarchitecture (MICRO). IEEE, 631\u2013645","author":"Tian Chunlin","year":"2022","unstructured":"Chunlin Tian, Li Li, Zhan Shi, Jun Wang, and ChengZhong Xu. 2022. Harmony: Heterogeneity-aware hierarchical management for federated learning system. In 2022 55th IEEE\/ACM International Symposium on Microarchitecture (MICRO). IEEE, 631\u2013645."},{"key":"e_1_3_2_1_50_1","volume-title":"2025 USENIX Annual Technical Conference (USENIX ATC 25)","author":"Tian Chunlin","year":"2025","unstructured":"Chunlin Tian, Xinpeng Qin, Kahou Tam, Li Li, Zijian Wang, Yuanzhe Zhao, Minglei Zhang, and Chengzhong Xu. 2025. {CLONE}: Customizing {LLMs} for Efficient {Latency-Aware} Inference at the Edge. In 2025 USENIX Annual Technical Conference (USENIX ATC 25). 563\u2013585."},{"key":"e_1_3_2_1_51_1","first-page":"9565","article-title":"Hydralora: An asymmetric lora architecture for efficient fine-tuning","volume":"37","author":"Tian Chunlin","year":"2024","unstructured":"Chunlin Tian, Zhan Shi, Zhijiang Guo, Li Li, and Chengzhong Xu. 2024. Hydralora: An asymmetric lora architecture for efficient fine-tuning. Advances in Neural Information Processing Systems 37 (2024), 9565\u20139584.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_52_1","volume-title":"Floe: Federated Specialization for Real-Time LLM-SLM Inference","author":"Tian Chunlin","year":"2026","unstructured":"Chunlin Tian, Kahou Tam, Yebo Wu, Shuaihang Zhong, Li Li, Nicholas D Lane, and ChengZhong Xu. 2026. Floe: Federated Specialization for Real-Time LLM-SLM Inference. IEEE Transactions on Parallel and Distributed Systems (2026)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.780"},{"key":"e_1_3_2_1_54_1","volume-title":"Two tales of persona in llms: A survey of role-playing and personalization. arXiv preprint arXiv:2406.01171","author":"Tseng Yu-Min","year":"2024","unstructured":"Yu-Min Tseng, Yu-Chao Huang, Teng-Yun Hsiao, Wei-Lin Chen, Chao-Wei Huang, Yu Meng, and Yun-Nung Chen. 2024. Two tales of persona in llms: A survey of role-playing and personalization. arXiv preprint arXiv:2406.01171 (2024)."},{"key":"e_1_3_2_1_55_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of ACM MobiSys. 450\u2013463","author":"Wang Qipeng","year":"2022","unstructured":"Qipeng Wang, Mengwei Xu, Chao Jin, Xinran Dong, Jinliang Yuan, Xin Jin, Gang Huang, Yunxin Liu, and Xuanzhe Liu. 2022. Melon: Breaking the memory wall for resource-efficient on-device machine learning. In Proceedings of ACM MobiSys. 450\u2013463."},{"key":"e_1_3_2_1_57_1","volume-title":"Texture mapping unit. https:\/\/en.wikipedia.org\/wiki\/Texture_mapping_unit. Accessed","year":"2025","unstructured":"wiki. 2025. Texture mapping unit. https:\/\/en.wikipedia.org\/wiki\/Texture_mapping_unit. Accessed: 2025.9."},{"key":"e_1_3_2_1_58_1","volume-title":"Elastic Mixture of Rank-Wise Experts for Knowledge Reuse in Federated Fine-Tuning. arXiv preprint arXiv:2512.00902","author":"Wu Yebo","year":"2025","unstructured":"Yebo Wu, Jingguang Li, Zhijiang Guo, and Li Li. 2025. Elastic Mixture of Rank-Wise Experts for Knowledge Reuse in Federated Fine-Tuning. arXiv preprint arXiv:2512.00902 (2025)."},{"key":"e_1_3_2_1_59_1","volume-title":"Memory-Efficient Federated Fine-Tuning of Large Language Models via Layer Pruning. arXiv preprint arXiv:2508.17209","author":"Wu Yebo","year":"2025","unstructured":"Yebo Wu, Jingguang Li, Chunlin Tian, Zhijiang Guo, and Li Li. 2025. Memory-Efficient Federated Fine-Tuning of Large Language Models via Layer Pruning. arXiv preprint arXiv:2508.17209 (2025)."},{"key":"e_1_3_2_1_60_1","volume-title":"Beyond End-to-End: Dynamic Chain Optimization for Private LLM Adaptation on the Edge. arXiv preprint arXiv:2604.06819","author":"Wu Yebo","year":"2026","unstructured":"Yebo Wu, Jingguang Li, Chunlin Tian, Kahou Tam, Zhijiang Guo, and Li Li. 2026. Beyond End-to-End: Dynamic Chain Optimization for Private LLM Adaptation on the Edge. arXiv preprint arXiv:2604.06819 (2026)."},{"key":"e_1_3_2_1_61_1","volume-title":"Bridging Memory Gaps: Scaling Federated Learning for Heterogeneous Clients. arXiv preprint arXiv:2408.10826","author":"Wu Yebo","year":"2024","unstructured":"Yebo Wu, Jingguang Li, Chunlin Tian, Kahou Tam, Li Li, and Chengzhong Xu. 2024. Bridging Memory Gaps: Scaling Federated Learning for Heterogeneous Clients. arXiv preprint arXiv:2408.10826 (2024)."},{"key":"e_1_3_2_1_62_1","volume-title":"Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V. 1. 1623\u20131632","author":"Wu Yebo","year":"2025","unstructured":"Yebo Wu, Li Li, and Cheng-zhong Xu. 2025. Breaking the memory wall for heterogeneous federated learning via progressive training. In Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V. 1. 1623\u20131632."},{"key":"e_1_3_2_1_63_1","unstructured":"Yebo Wu Chunlin Tian Jingguang Li He Sun Kahou Tam Zhanting Zhou Haicheng Liao Jing Xiong Zhijiang Guo Li Li et al. 2025. A survey on federated fine-tuning of large language models. arXiv preprint arXiv:2503.12016 (2025)."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3495243.3560545"},{"key":"e_1_3_2_1_65_1","volume-title":"On-device language models: A comprehensive review. arXiv preprint arXiv:2409.00088","author":"Xu Jiajun","year":"2024","unstructured":"Jiajun Xu, Zhiyuan Li, Wei Chen, Qun Wang, Xin Gao, Qi Cai, and Ziyuan Ling. 2024. On-device language models: A comprehensive review. arXiv preprint arXiv:2409.00088 (2024)."},{"key":"e_1_3_2_1_66_1","unstructured":"Mengwei Xu Wangsong Yin Dongqi Cai Rongjie Yi Daliang Xu Qipeng Wang Bingyang Wu Yihao Zhao Chen Yang Shihe Wang et al. 2024. A survey of resource-efficient llm and multimodal foundation models. arXiv preprint arXiv:2401.08092 (2024)."},{"key":"e_1_3_2_1_67_1","unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv et al. 2025. Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025)."},{"key":"e_1_3_2_1_68_1","unstructured":"Zhehao Zhang Ryan A Rossi Branislav Kveton Yijia Shao Diyi Yang Hamed Zamani Franck Dernoncourt Joe Barrow Tong Yu Sungchul Kim et al. 2024. Personalization of large language models: A survey. arXiv preprint arXiv:2411.00027 (2024)."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2024.3392335"},{"key":"e_1_3_2_1_70_1","volume-title":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"1","author":"Zhou Keren","year":"2026","unstructured":"Keren Zhou, Mario Lezcano-Casado, Adam P Goucher, Akhmed Rakhmati, Jeff Niu, Justin Lebar, Pawel Szczerbuk, Peter Bell, Phil Tillet, Thomas Raoux, et al. 2026. Linear Layouts: Robust Code Generation of Efficient Tensor Computation Using F_2. In Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1. 132\u2013146."}],"event":{"name":"MobiSys '26: 24th Annual International Conference on Mobile Systems, Applications and Services","location":"University of Cambridge Cambridge United Kingdom","acronym":"MobiSys '26","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 24th Annual International Conference on Mobile Systems, Applications and Services"],"original-title":[],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T13:02:38Z","timestamp":1780059758000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3745756.3809214"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,20]]},"references-count":70,"alternative-id":["10.1145\/3745756.3809214","10.1145\/3745756"],"URL":"https:\/\/doi.org\/10.1145\/3745756.3809214","relation":{},"subject":[],"published":{"date-parts":[[2026,6,20]]},"assertion":[{"value":"2026-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}