{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,23]],"date-time":"2026-02-23T21:26:51Z","timestamp":1771882011857,"version":"3.50.1"},"reference-count":28,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["RS-2025-08182968"],"award-info":[{"award-number":["RS-2025-08182968"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/access.2026.3661980","type":"journal-article","created":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T20:51:07Z","timestamp":1770411067000},"page":"24801-24811","source":"Crossref","is-referenced-by-count":0,"title":["MaaI: ML-as-an-Infrastructure for Diverse Personalized Inference With Edge Offloading"],"prefix":"10.1109","volume":"14","author":[{"given":"Junyong","family":"Lee","sequence":"first","affiliation":[{"name":"School of Integrated Technology, Yonsei University, Seoul, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3153-469X","authenticated-orcid":false,"given":"Jeihee","family":"Cho","sequence":"additional","affiliation":[{"name":"School of Integrated Technology, Yonsei University, Seoul, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9935-1721","authenticated-orcid":false,"given":"Shiho","family":"Kim","sequence":"additional","affiliation":[{"name":"School of Integrated Technology, Yonsei University, Seoul, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv:2302.13971"},{"key":"ref2","volume-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"ref3","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref4","volume-title":"Inside Github: Working With the Llms Behind Github Copilot","author":"Verdi","year":"2024"},{"key":"ref5","article-title":"Code llama: Open foundation models for code","author":"Rozi\u00e8re","year":"2023","journal-title":"arXiv:2308.12950"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-023-06291-2"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1038\/s41591-024-03423-7"},{"key":"ref8","article-title":"PaLM: Scaling language modeling with pathways","author":"Chowdhery","year":"2022","journal-title":"arXiv:2204.02311"},{"key":"ref9","article-title":"PaLM 2 technical report","volume-title":"arXiv:2305.10403","author":"Anil","year":"2023"},{"key":"ref10","article-title":"Phi-3 technical report: A highly capable language model locally on your phone","volume-title":"arXiv:2404.14219","author":"Abdin","year":"2024"},{"key":"ref11","article-title":"MobileLLM: Optimizing sub-billion parameter language models for on-device use cases","author":"Liu","year":"2024","journal-title":"arXiv:2402.14905"},{"key":"ref12","first-page":"6010","article-title":"Searching for efficient transformers for language modeling","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"So"},{"key":"ref13","first-page":"22470","article-title":"Combiner: Full attention transformer with sparse computation cost","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ren"},{"key":"ref14","first-page":"16344","article-title":"FlashAttention: Fast and memory-efficient exact attention with IO-awareness","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dao"},{"key":"ref15","article-title":"FlashAttention-2: Faster attention with better parallelism and work partitioning","author":"Dao","year":"2023","journal-title":"arXiv:2307.08691"},{"key":"ref16","volume-title":"Peft: State-of-the-Art Parameter-Efficient Fine-Tuning Methods","author":"Mangrulkar","year":"2022"},{"key":"ref17","article-title":"Towards a unified view of parameter-efficient transfer learning","author":"He","year":"2021","journal-title":"arXiv:2110.04366"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"ref19","first-page":"4582","article-title":"Prefix-tuning: Optimizing continuous prompts for generation","volume-title":"Proc. 59th Annu. Meeting Assoc. Comput. Linguistics 11th Int. Joint Conf. Natural Lang. Process.","author":"Li"},{"key":"ref20","article-title":"LoRA: Low-rank adaptation of large language models","author":"Hu","year":"2021","journal-title":"arXiv:2106.09685"},{"key":"ref21","article-title":"QLoRA: Efficient finetuning of quantized LLMs","author":"Dettmers","year":"2023","journal-title":"arXiv:2305.14314"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3714983.3714987"},{"key":"ref23","first-page":"196","article-title":"Atom: Low-bit quantization for efficient and accurate LLM serving","volume-title":"Proc. Mach. Learn. Syst.","author":"Zhao"},{"key":"ref24","article-title":"Pointer sentinel mixture models","author":"Merity","year":"2016","journal-title":"arXiv:1609.07843"},{"key":"ref25","first-page":"70","article-title":"SAMSum corpus: A human-annotated dialogue dataset for abstractive summarization","volume-title":"Proc. 2nd Workshop New Frontiers Summarization","author":"Gliwa"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.3390\/app11146421"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0146250"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1177\/0018720809359349"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/6287639\/11323511\/11373326-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6287639\/11323511\/11373326.pdf?arnumber=11373326","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,23]],"date-time":"2026-02-23T20:48:57Z","timestamp":1771879737000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11373326\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":28,"URL":"https:\/\/doi.org\/10.1109\/access.2026.3661980","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}