{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,18]],"date-time":"2026-01-18T23:45:31Z","timestamp":1768779931213,"version":"3.49.0"},"reference-count":77,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Science and Technology Development Fund of Macau","award":["0107\/2024\/RIA2"],"award-info":[{"award-number":["0107\/2024\/RIA2"]}]},{"name":"Joint Science and Technology Research Project with Hong Kong and Macau in Key Areas of Nansha District&#x2019;s Science and Technology Plan","award":["EF2024-00180-IOTSC"],"award-info":[{"award-number":["EF2024-00180-IOTSC"]}]},{"name":"Multi-Year Research Grant of University of Macau","award":["MYRG-GRG2023-00211-IOTSC-UMDF"],"award-info":[{"award-number":["MYRG-GRG2023-00211-IOTSC-UMDF"]}]},{"name":"Multi-Year Research Grant of University of Macau","award":["MYRG-GRG2024-00180-IOTSC"],"award-info":[{"award-number":["MYRG-GRG2024-00180-IOTSC"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1109\/tpds.2025.3587445","type":"journal-article","created":{"date-parts":[[2025,7,9]],"date-time":"2025-07-09T23:21:02Z","timestamp":1752103262000},"page":"2014-2029","source":"Crossref","is-referenced-by-count":2,"title":["m2LLM: A Multi-Dimensional Optimization Framework for LLM Inference on Mobile Devices"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-1904-6132","authenticated-orcid":false,"given":"Kaiyuan","family":"Liu","sequence":"first","affiliation":[{"name":"Faculty of Science and Technology, State Key Laboratory of IoTSC, University of Macau, Taipa, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9500-3390","authenticated-orcid":false,"given":"Xiaobo","family":"Zhou","sequence":"additional","affiliation":[{"name":"Faculty of Science and Technology, State Key Laboratory of IoTSC, University of Macau, Taipa, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2044-8289","authenticated-orcid":false,"given":"Li","family":"Li","sequence":"additional","affiliation":[{"name":"Faculty of Science and Technology, State Key Laboratory of IoTSC, University of Macau, Taipa, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Hello GPT-4o","year":"2024"},{"key":"ref2","article-title":"Reshaping industries with AI: Huawei cloud launches pangu models 3.0 and ascend AI cloud services","year":"2024"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.hcc.2024.100211"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3498361.3538928"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TCSI.2021.3064189"},{"key":"ref7","article-title":"Bigscience large open-science open-access multilingual language model 3B","year":"2024"},{"key":"ref8","article-title":"Snapdragon 8 gen 3 mobile platform","year":"2024"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.90"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"ref11","article-title":"Distserve: Disaggregating prefill and decoding for goodput-optimized large language model serving","author":"Zhong","year":"2024"},{"key":"ref12","first-page":"31094","article-title":"FlexGEN: High-throughput generative inference of large language models with a single GPU","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Sheng"},{"key":"ref13","article-title":"Inference of meta\u2019s LLaMA model (and others) in pure C\/C","year":"2024"},{"key":"ref14","article-title":"Powerinfer-2: Fast large language model inference on a smartphone","author":"Xue","year":"2024"},{"key":"ref15","first-page":"521","article-title":"Orca: A distributed serving system for $\\lbrace${Transformer-Based$\\rbrace$} generative models","volume-title":"Proc. 16th USENIX Symp. Operating Syst. Des. Implementation","author":"Yu"},{"key":"ref16","article-title":"Sarathi: Efficient LLM inference by piggybacking decodes with chunked prefills","author":"Agrawal","year":"2023"},{"key":"ref17","article-title":"A simple and effective pruning approach for large language models","author":"Sun","year":"2023"},{"key":"ref18","article-title":"When less is more: Investigating data pruning for pretraining LLMS at scale","author":"Marion","year":"2023"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29860"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3714983.3714987"},{"key":"ref21","article-title":"Atom: Low-bit quantization for efficient and accurate LLM serving","author":"Zhao","year":"2023"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01453-z"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1139"},{"key":"ref24","article-title":"MiniLM: Deep self-attention distillation for task-agnostic compression of pre-trained transformers","author":"Wang","year":"2020"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575698"},{"key":"ref26","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020"},{"key":"ref27","article-title":"Realme GT5 pro","year":"2024"},{"key":"ref28","article-title":"Llama 3 model card","year":"2024"},{"key":"ref29","article-title":"OPT: Open pre-trained transformer language models","author":"Zhang","year":"2022"},{"key":"ref30","article-title":"Static-kv-cache-and-torchcompile","year":"2024"},{"key":"ref31","article-title":"Bigscience large open-science open-access multilingual language model 1B7","year":"2024"},{"key":"ref32","article-title":"OPT: Open pre-trained transformer language models 2.7B","year":"2024"},{"key":"ref33","article-title":"OPT: Open pre-trained transformer language models 1.3B","year":"2024"},{"key":"ref34","article-title":"Zram: Compressed ram-based block devices","year":"2024"},{"key":"ref35","article-title":"Swapfaq","year":"2024"},{"key":"ref36","article-title":"Termux-app","year":"2024"},{"key":"ref37","article-title":"Overview of memory management","year":"2024"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3406522.3446038"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3106372"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"ref41","doi-asserted-by":"crossref","first-page":"469","DOI":"10.18653\/v1\/2024.blackboxnlp-1.29","article-title":"Investigating layer importance in large language models","volume-title":"Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP","author":"Zhang","year":"2024"},{"key":"ref42","first-page":"21702","article-title":"LLM-pruner: On the structural pruning of large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ma"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01544"},{"key":"ref44","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","volume-title":"Text Summarization Branches Out","author":"Lin","year":"2004"},{"key":"ref45","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","volume-title":"Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization","author":"Banerjee","year":"2005"},{"key":"ref46","article-title":"RoBERTa: A robustly optimized BERT pretraining approach","author":"Liu","year":"2019"},{"key":"ref47","article-title":"Pytorch","year":"2024"},{"key":"ref48","first-page":"38","article-title":"Transformers: State-of-the-art natural language processing","volume-title":"Proc. 2020 Conf. Empirical Methods Natural Lang. Process.: System Demonstrations","author":"Wolf"},{"key":"ref49","article-title":"Powermonitor","year":"2024"},{"key":"ref50","article-title":"MLC-LLM","year":"2023"},{"key":"ref51","article-title":"MNN: A universal and efficient inference engine","volume-title":"Proc. 6th ACM Int. Conf. Multimedia Asia Workshops","author":"Jiang"},{"key":"ref52","first-page":"311","article-title":"BLEU: A method for automatic evaluation of machine translation","volume-title":"Proc. 40th Annu. Meeting Assoc. Comput. Linguistics","author":"Papineni"},{"key":"ref53","article-title":"BERTScore: Evaluating text generation with BERT","author":"Zhang","year":"2020"},{"key":"ref54","article-title":"Tinychatengine: On-device llm\/vlm inference library","year":"2024"},{"key":"ref55","article-title":"A framework for few-shot language model evaluation","author":"Gao","year":"2024"},{"key":"ref56","article-title":"Rowan\/hellaswag","year":"2024"},{"key":"ref57","article-title":"ibragim-bad\/arc-easy","year":"2024"},{"key":"ref58","article-title":"Google\/boolq","year":"2024"},{"key":"ref59","article-title":"ybisk\/piqa","year":"2024"},{"key":"ref60","article-title":"allenai\/winogrande","year":"2024"},{"key":"ref61","article-title":"allenai\/openbookqa","year":"2024"},{"key":"ref62","article-title":"ibragim-bad\/arc-challenge","year":"2024"},{"key":"ref63","doi-asserted-by":"crossref","DOI":"10.1162\/tacl_a_00023","article-title":"The NarrativeQA reading comprehension challenge","volume-title":"Trans. Assoc. Comput. Linguistics","author":"s Ko\u010disk\u00fd","year":"2018"},{"key":"ref64","doi-asserted-by":"crossref","DOI":"10.1109\/CVPR52729.2023.01544","article-title":"DepGraph: Towards any structural pruning","author":"Fang","year":"2023"},{"key":"ref65","first-page":"30318","article-title":"GPT3. int8 (): 8-bit matrix multiplication for transformers at scale","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dettmers"},{"issue":"187","key":"ref66","first-page":"1","article-title":"Quantized neural networks: Training neural networks with low precision weights and activations","volume":"18","author":"Hubara","year":"2018","journal-title":"J. Mach. Learn. Res."},{"key":"ref67","article-title":"Just-in-time quantization with processing-in-memory for efficient ML training","author":"Ibrahim","year":"2023"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651368"},{"key":"ref69","article-title":"GPTQ: Accurate post-training quantization for generative pre-trained transformers","author":"Frantar","year":"2023"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.412"},{"key":"ref71","article-title":"Efficient prompting via dynamic in-context learning","author":"Zhou","year":"2023"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.391"},{"key":"ref73","author":"Yin","year":"2023","journal-title":""},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3403426"},{"key":"ref75","article-title":"RECOMP: Improving retrieval-augmented LMs with compression and selective augmentation","author":"Xu","year":"2023"},{"key":"ref76","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2023.emnlp-main.825","article-title":"LLMLingua: Compressing prompts for accelerated inference of large language models","author":"Jiang","year":"2023"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.91"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/71\/11123590\/11075620.pdf?arnumber=11075620","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,13]],"date-time":"2025-08-13T19:48:52Z","timestamp":1755114532000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11075620\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10]]},"references-count":77,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2025.3587445","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"value":"1045-9219","type":"print"},{"value":"1558-2183","type":"electronic"},{"value":"2161-9883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10]]}}}