{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T06:47:00Z","timestamp":1774680420735,"version":"3.50.1"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T00:00:00Z","timestamp":1770076800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T00:00:00Z","timestamp":1770076800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,2,3]]},"DOI":"10.1109\/icce67443.2026.11449769","type":"proceedings-article","created":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T19:47:50Z","timestamp":1774640870000},"page":"1-6","source":"Crossref","is-referenced-by-count":0,"title":["On-Device-First Hybrid LLM Inference on AI-PCs: Closing the Enterprise GenAI Divide"],"prefix":"10.1109","author":[{"given":"Sultana","family":"Begum","sequence":"first","affiliation":[{"name":"Intel Corporation"}]},{"given":"Kris","family":"Fleming","sequence":"additional","affiliation":[{"name":"Intel Corporation"}]},{"given":"Todd","family":"Lewellen","sequence":"additional","affiliation":[{"name":"Intel Corporation"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Hymba: Hybrid-head architecture for small language models","author":"Dong","year":"2024"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.1659"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3762190"},{"key":"ref4","article-title":"Phi-3 technical report: A highly capable language model locally on your phone","author":"Abdin","year":"2024"},{"key":"ref5","article-title":"Qwen2.5 technical report","author":"Yang","year":"2024"},{"key":"ref6","article-title":"Challenging GPU dominance: When CPUs outperform for on-device LLM inference","author":"Zhang","year":"2025"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i10.33194"},{"key":"ref8","article-title":"TinyLLaVA: A framework of small-scale large multimodal models","author":"Zhou","year":"2024"},{"key":"ref9","article-title":"TinyGPT-V: Efficient multimodal large language model via small backbones","author":"Yuan","year":"2023"},{"key":"ref10","article-title":"MobileVLM: A fast, strong and open vision language assistant for mobile devices","author":"Chu","year":"2023"},{"key":"ref11","article-title":"The ultimate guide to fine-tuning LLMs from basics to breakthroughs: An exhaustive review of technologies, research, best practices, applied research challenges and opportunities","author":"Parthasarathy","year":"2024"},{"key":"ref12","article-title":"Parameter-efficient fine-tuning for large models: A comprehensive survey","author":"Han","year":"2024"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-demos.38"},{"key":"ref14","article-title":"RAG-Anything: All-in-one RAG framework","author":"Guo","year":"2025"},{"key":"ref15","article-title":"ReSum: Unlocking long-horizon search intelligence via context summarization","author":"Wu","year":"2025"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.3233\/FAIA251160"},{"key":"ref17","article-title":"The unreasonable effectiveness of scaling agents for computer use","author":"Gonzalez-Pumariega","year":"2025"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.17625"},{"key":"ref19","article-title":"MixPE: Quantization and hardware co-design for efficient LLM inference","author":"Zhang","year":"2024"},{"key":"ref20","article-title":"Scaling up throughput-oriented LLM inference applications on heterogeneous opportunistic GPU clusters with pervasive context management","author":"Phung","year":"2024","journal-title":"LLM Optimization Best Practices on GKE (data type, serving, scaling). Google Cloud ML Inference Guide"},{"issue":"21","key":"ref21","first-page":"34461","article-title":"MEMO: Fine-grained tensor management for ultra-long context LLM training","volume-title":"(AAAI) Efficient LLM Inference on Heterogeneous Platforms","volume":"39","author":"Zhao","year":"2025"},{"key":"ref22","article-title":"Optimizing LLM inference throughput via memory-aware and SLA-constrained dynamic batching","volume-title":"Performance and Energy Evaluation of LLMs on Modern Hardware","author":"Pang","year":"2025"},{"key":"ref23","article-title":"Investigating energy efficiency and performance trade-offs in LLM inference across tasks and DVFS settings","author":"Maliakel","year":"2025"},{"key":"ref24","article-title":"Efficient training of large language models on distributed infrastructures: A survey","author":"Duan","year":"2024"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref26","article-title":"LightMem: Lightweight and efficient memory-augmented generation","author":"Fang","year":"2025"},{"key":"ref27","doi-asserted-by":"crossref","DOI":"10.32388\/XI1064","article-title":"Towards more economical context-augmented LLM generation by reusing stored KV cache","author":"Li","year":"2025"},{"key":"ref28","article-title":"CAIM: Development and evaluation of a cognitive AI memory framework for long-term interaction with intelligent agents","author":"Westh\u00e4u\u00dfer","year":"2025"},{"key":"ref29","article-title":"A survey of context engineering for large language models","author":"Mei","year":"2025"},{"key":"ref30","article-title":"Collaborative inference and learning between edge SLMs and cloud LLMs: A survey of algorithms, execution, and open challenges","author":"Li","year":"2025"},{"key":"ref31","article-title":"Hybrid SD: Edge-cloud collaborative inference for Stable Diffusion models","author":"Yan","year":"2024"},{"key":"ref32","article-title":"A survey on deep neural network partition over cloud, edge and end devices","author":"Xu","year":"2023"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCSW63273.2025.00058"},{"key":"ref34","article-title":"UltraCUA: A foundation model for computer use agents with hybrid action","author":"Yang","year":"2025"},{"key":"ref35","doi-asserted-by":"crossref","DOI":"10.1145\/3662006.3662067","article-title":"Hybrid SLM and LLM for Edge-Cloud Collaborative Inference","volume-title":"Proceedings of the Workshop on Edge and Mobile Foundation Models (EdgeFM \u201924)","author":"Hao"},{"key":"ref36","article-title":"Rearchitecting datacenter lifecycle for AI: A TCO-driven framework","author":"Stojkovic","year":"2025"},{"key":"ref37","article-title":"An inquiry into datacenter TCO for LLM inference with FP8","author":"Kim","year":"2025"},{"key":"ref38","article-title":"Energy use of AI inference: Efficiency pathways and test-time compute","author":"Oviedo","year":"2025","journal-title":"Private LLM Integrator (LLM.co) Cost Range Estimates for Custom\/Hybrid\/Private Deployments"},{"key":"ref39","article-title":"Large language model inference acceleration: A comprehensive hardware perspective","author":"Li","year":"2024"},{"key":"ref40","article-title":"The GenAI Divide: State of AI in Business 2025","volume-title":"MIT NANDA, report","author":"Challapally","year":"2025"}],"event":{"name":"2026 IEEE International Conference on Consumer Electronics (ICCE)","location":"Dubai, United Arab Emirates","start":{"date-parts":[[2026,2,3]]},"end":{"date-parts":[[2026,2,5]]}},"container-title":["2026 IEEE International Conference on Consumer Electronics (ICCE)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11449575\/11449585\/11449769.pdf?arnumber=11449769","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T05:23:28Z","timestamp":1774675408000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11449769\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,3]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/icce67443.2026.11449769","relation":{},"subject":[],"published":{"date-parts":[[2026,2,3]]}}}