{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T13:09:18Z","timestamp":1775912958061,"version":"3.50.1"},"reference-count":60,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"5","license":[{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62394324"],"award-info":[{"award-number":["62394324"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Young Elite Scientists Sponsorship Program by CAST","award":["2022QNRC001"],"award-info":[{"award-number":["2022QNRC001"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Serv. Comput."],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1109\/tsc.2025.3596892","type":"journal-article","created":{"date-parts":[[2025,8,8]],"date-time":"2025-08-08T18:40:10Z","timestamp":1754678410000},"page":"3321-3333","source":"Crossref","is-referenced-by-count":7,"title":["TPI-LLM: Serving 70B-Scale LLMs Efficiently on Low-Resource Mobile Devices"],"prefix":"10.1109","volume":"18","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2796-039X","authenticated-orcid":false,"given":"Zonghang","family":"Li","sequence":"first","affiliation":[{"name":"Department of Machine Learning, MBZUAI, Abu Dhabi, UAE"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6117-2999","authenticated-orcid":false,"given":"Wenjiao","family":"Feng","sequence":"additional","affiliation":[{"name":"School of Information and Communication Engineering, UESTC, Chengdu, China"}]},{"given":"Mohsen","family":"Guizani","sequence":"additional","affiliation":[{"name":"Department of Machine Learning, MBZUAI, Abu Dhabi, UAE"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5219-1780","authenticated-orcid":false,"given":"Hongfang","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Information and Communication Engineering, UESTC, Chengdu, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Megatron-LM: Training multi-billion parameter language models using model parallelism","author":"Shoeybi","year":"2019"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref3","first-page":"663","article-title":"AlpaServe: Statistical multiplexing with model parallelism for deep learning serving","volume-title":"Proc. 17th USENIX Symp. Operating Syst. Des. Implementation","author":"Li"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"ref5","article-title":"Infinite-LLM: Efficient LLM service for long context with distattention and distributed kvcache","author":"Lin","year":"2024"},{"key":"ref6","first-page":"21946","article-title":"HEXGEN: Generative inference of foundation model over heterogeneous decentralized environment","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Jiang"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640383"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/jiot.2024.3524255"},{"key":"ref10","article-title":"Helix: Distributed serving of large language models via max-flow on heterogeneous GPUs","author":"Mei","year":"2024"},{"key":"ref11","first-page":"12312","article-title":"Distributed inference and fine-tuning of large language models over the internet","volume-title":"Proc. 37th Adv. Neural Inf. Process. Syst.","author":"Borzunov"},{"key":"ref12","article-title":"Distributed llama","author":"Tadych","year":"2024"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM52122.2024.10621342"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref15","first-page":"31094","article-title":"FlexGen: High-throughput generative inference of large language models with a single GPU","volume-title":"Proc. 40th Int. Conf. Mach. Learn.","author":"Sheng"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707256"},{"key":"ref17","first-page":"499","article-title":"PipeSwitch: Fast pipelined context switching for deep learning applications","volume-title":"Proc. 14th USENIX Symp. Operating Syst. Des. Implementation","author":"Bai"},{"key":"ref18","first-page":"162","article-title":"HeteGen: Efficient heterogeneous parallel inference for large language models on resource-constrained devices","volume-title":"Proc. 6th Mach. Learn. Syst.","author":"Xuanlei"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695964"},{"key":"ref20","article-title":"Accelerate: Training and inference at scale made simple, efficient and adaptable","author":"Gugger","year":"2022"},{"key":"ref21","article-title":"llama.cpp: LLM inference in C\/C","author":"Gerganov","year":"2024"},{"key":"ref22","first-page":"2025","article-title":"Klonet: An easy-to-use and scalable platform for computer networks education","volume-title":"Proc. 21st USENIX Symp. Netw. Syst. Des. Implementation","author":"Ma"},{"key":"ref23","article-title":"The llama 3 herd of models","author":"Dubey","year":"2024"},{"key":"ref24","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref25","article-title":"Yi: Open foundation models by 01. AI","author":"Young","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"ref27","article-title":"Model compression and efficient inference for large language models: A survey","author":"Wang","year":"2024"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3656177"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00051"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/tcsi.2025.3546256"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3626202.3637562"},{"key":"ref32","first-page":"16344","article-title":"FLASHATTENTION: Fast and memory-efficient exact attention with IO-awareness","volume-title":"Proc. 35th Adv. Neural Inf. Process. Syst.","author":"Dao"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS54959.2023.00042"},{"key":"ref34","article-title":"FlashDecoding: Faster large language model inference on GPUs","author":"Hong","year":"2023"},{"key":"ref35","article-title":"LightLLM: A Python-based LLM inference and serving framework","year":"2024"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref37","article-title":"MobileLLM: Optimizing sub-billion parameter language models for on-device use cases","author":"Liu","year":"2024"},{"key":"ref38","article-title":"What is the role of small models in the LLM era: A survey","author":"Chen","year":"2024"},{"key":"ref39","article-title":"exo: Run your own AI cluster at home with everyday devices","year":"2024"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/GLOBECOM52923.2024.10901542"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/SDS60720.2024.00036"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3736721"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3603287.3651205"},{"key":"ref44","article-title":"On-device language models: A comprehensive review","author":"Xu","year":"2024"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICWS62655.2024.00099"},{"key":"ref46","article-title":"MNN-LLM: LLM deployment project based on MNN","author":"Wang","year":"2024"},{"key":"ref47","article-title":"MLLM: Fast multimodal LLM on mobile devices","year":"2024"},{"key":"ref48","article-title":"MLC-LLM: Universal LLM deployment engine with ML compilation","year":"2023"},{"key":"ref49","first-page":"1","article-title":"MediaPipe: A framework for perceiving and processing reality","volume-title":"Proc. 3rd Workshop Comput. Vis. AR\/VR CVPR","author":"Lugaresi"},{"key":"ref50","article-title":"A survey of resource-efficient LLM and multimodal foundation models","author":"Xu","year":"2024"},{"key":"ref51","article-title":"PocketPal AI: An app that brings language models directly to your phone","author":"Ghorbani","year":"2024"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.23919\/date58400.2024.10546617"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2017.12.012"},{"key":"ref55","article-title":"MXNet: A flexible and efficient machine learning library for heterogeneous distributed systems","author":"Chen","year":"2015"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2024.3412429"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TNSM.2021.3106315"},{"key":"ref58","first-page":"10088","article-title":"QLoRA: Efficient finetuning of quantized LLMs","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dettmers"},{"key":"ref59","article-title":"Mamba: Linear-time sequence modeling with selective state spaces","author":"Gu","year":"2023"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.936"}],"container-title":["IEEE Transactions on Services Computing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/4629386\/11198176\/11119787.pdf?arnumber=11119787","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,10]],"date-time":"2025-10-10T22:30:51Z","timestamp":1760135451000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11119787\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9]]},"references-count":60,"journal-issue":{"issue":"5"},"URL":"https:\/\/doi.org\/10.1109\/tsc.2025.3596892","relation":{},"ISSN":["1939-1374","2372-0204"],"issn-type":[{"value":"1939-1374","type":"electronic"},{"value":"2372-0204","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9]]}}}