{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T21:14:27Z","timestamp":1780089267630,"version":"3.54.0"},"reference-count":285,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62302184"],"award-info":[{"award-number":["62302184"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Research Grants Council of the Hong Kong Special Administrative Region, China","award":["PolyU15222621"],"award-info":[{"award-number":["PolyU15222621"]}]},{"name":"Research Grants Council of the Hong Kong Special Administrative Region, China","award":["PolyU15225023"],"award-info":[{"award-number":["PolyU15225023"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Commun. Surv. Tutorials"],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/comst.2025.3580745","type":"journal-article","created":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T13:42:13Z","timestamp":1750254133000},"page":"1483-1519","source":"Crossref","is-referenced-by-count":4,"title":["Deploying Foundation Model Powered Agent Services: A Survey"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0983-387X","authenticated-orcid":false,"given":"Wenchao","family":"Xu","sequence":"first","affiliation":[{"name":"Division of Integrative Systems and Design, Hong Kong University of Science and Technology, Hong Kong, SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3502-0146","authenticated-orcid":false,"given":"Jinyu","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computing, The Hong Kong Polytechnic University, Hong Kong, SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2694-9451","authenticated-orcid":false,"given":"Peirong","family":"Zheng","sequence":"additional","affiliation":[{"name":"Department of Computing, The Hong Kong Polytechnic University, Hong Kong, SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaoquan","family":"Yi","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2152-2627","authenticated-orcid":false,"given":"Tianyi","family":"Tian","sequence":"additional","affiliation":[{"name":"School of Information and Communication Engineering, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wenhui","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of Information and Communication Engineering, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Quan","family":"Wan","sequence":"additional","affiliation":[{"name":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7591-5315","authenticated-orcid":false,"given":"Haozhao","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2277-5355","authenticated-orcid":false,"given":"Yunfeng","family":"Fan","sequence":"additional","affiliation":[{"name":"Department of Computing, The Hong Kong Polytechnic University, Hong Kong, SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5903-2504","authenticated-orcid":false,"given":"Qinliang","family":"Su","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4140-287X","authenticated-orcid":false,"given":"Xuemin","family":"Shen","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, University of Waterloo, Waterloo, ON, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"On the opportunities and risks of foundation models","author":"Bommasani","year":"2022"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4222-0"},{"key":"ref3","volume-title":"107 up-to-date ChatGPT statistics & user numbers","year":"2024"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.3390\/app15020586"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2023.3329027"},{"key":"ref6","volume-title":"Towards efficient generative large language model serving: A survey from algorithms to systems","author":"Miao","year":"2023"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2024.3353265"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2022.3199544"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3641289"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwae403"},{"key":"ref11","doi-asserted-by":"crossref","first-page":"1556","DOI":"10.1162\/tacl_a_00704","article-title":"A survey on model compression for large language models","volume":"12","author":"Zhu","year":"2024","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3675417.3675487"},{"key":"ref13","article-title":"A survey on knowledge distillation of large language models","author":"Xu","year":"2024","journal-title":"arXiv:2402.13116"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/s11704-024-40231-1"},{"key":"ref15","first-page":"8048","article-title":"Large language model based multi-agents: A survey of progress and challenges","volume-title":"Proc. 33rd Int. Joint Conf. Artif. Intell. (IJCAI)","author":"Guo"},{"key":"ref16","volume-title":"OpenAI o1 system card","author":"OpenAI","year":"2024"},{"key":"ref17","article-title":"DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning","author":"Guo","year":"2025","journal-title":"arXiv:2501.12948"},{"key":"ref18","volume-title":"OpenAI ChatGPT","year":"2024"},{"key":"ref19","volume-title":"Wenxin Yiyan","year":"2024"},{"key":"ref20","volume-title":"Github copilot: Your AI pair programmer","year":"2023"},{"key":"ref21","volume-title":"OpenAI codex","year":"2021"},{"key":"ref22","volume-title":"Midjourney","year":"2023"},{"key":"ref23","volume-title":"Sora","year":"2024"},{"key":"ref24","volume-title":"Gen-2 by runway","year":"2023"},{"key":"ref25","volume-title":"Auto-GPT","author":"Richards","year":"2023"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606763"},{"key":"ref27","first-page":"1","article-title":"Voyager: An open-ended embodied agent with large language models","author":"Wang","year":"2023","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.810"},{"key":"ref29","first-page":"1","article-title":"Position: Levels of AGI for operationalizing progress on the path to AGI","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Morris"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671837"},{"key":"ref31","first-page":"51963","article-title":"MAGIS: LLM-based multi-agent framework for GitHub issue resolution","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Tao"},{"key":"ref32","first-page":"25151","article-title":"How2Comm: Communication-efficient and collaboration-pragmatic multi-agent perception","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yang"},{"key":"ref33","first-page":"1","article-title":"Should we be going MAD? A look at multi-agent debate strategies for LLMs","volume-title":"Proc. 41st Int. Conf. Mach. Learn. (ICML)","author":"Smit"},{"key":"ref34","volume-title":"Harmonizing Users\u2019 and System\u2019s Requirements in Complex and Resource Intensive Application Domains by a Distributed Hybrid Approach","author":"Salutari","year":"2024"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-naacl.448"},{"key":"ref36","first-page":"59636","article-title":"Grounded decoding: Guiding text generation with grounded models for embodied agents","volume-title":"Proc. Conf. Neural Inf. Process. Syst.","author":"Huang"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.bea-1.62"},{"key":"ref38","article-title":"Multi-agent collaboration: Harnessing the power of intelligent LLM agents","author":"Talebirad","year":"2023","journal-title":"arXiv:2306.03314"},{"key":"ref39","first-page":"1","article-title":"Socialized learning: Making each other better through multi-agent collaboration","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Yao"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/OJCOMS.2024.3456549"},{"key":"ref41","article-title":"Dynamic LLM-agent network: An LLM-agent collaboration framework with agent team optimization","author":"Liu","year":"2023","journal-title":"arXiv:2310.02170"},{"key":"ref42","first-page":"1","article-title":"AgentVerse: Facilitating multi-agent collaboration and exploring emergent behaviors","volume-title":"Proc. 12th Int. Conf. Learn. Rep.","author":"Chen"},{"key":"ref43","first-page":"58","article-title":"AgentBench: Evaluating LLMs as agents","volume-title":"Proc. 12th Int. Conf. Learn. Rep.","author":"Liu"},{"key":"ref44","first-page":"1","article-title":"Building cooperative embodied agents modularly with large language models","volume-title":"Proc. 12th Int. Conf. Learn. Rep.","author":"Zhang"},{"key":"ref45","first-page":"4715","article-title":"Planning first, question second: An LLM-guided method for controllable question generation","volume-title":"Proc. Assoc. Comput. Linguist. (ACL)","author":"Li"},{"key":"ref46","first-page":"1","article-title":"DoraemonGPT: Toward understanding dynamic scenes with large language models (exemplified as a video agent)","volume-title":"Proc. 41st Int. Conf. Mach. Learn. (ICML)","author":"Yang"},{"key":"ref47","first-page":"3154","article-title":"Mindagent: Emergent gaming interaction","volume-title":"Proc. Findings Assoc. Comput. Linguist. (NAACL)","author":"Gong"},{"key":"ref48","first-page":"1","article-title":"Describe, explain, plan and select: Interactive planning with LLMs enables open-world multi-task agents","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Wang"},{"key":"ref49","first-page":"31967","article-title":"Large language models as commonsense knowledge for large-scale task planning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Zhao"},{"key":"ref50","first-page":"79081","article-title":"Leveraging pre-trained large language models to construct and utilize world models for model-based task planning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Guan"},{"key":"ref51","article-title":"Avalon\u2019s game of thoughts: Battle against deception through recursive contemplation","author":"Wang","year":"2023","journal-title":"arXiv:2310.01320"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.277"},{"key":"ref53","first-page":"50453","article-title":"MEMORYLLM: Towards self-updatable large language models","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Wang"},{"key":"ref54","first-page":"1","article-title":"Large language models are semi-parametric reinforcement learning agents","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Zhang"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.392"},{"key":"ref56","article-title":"From LLM to conversational agent: A memory enhanced architecture with fine-tuning of large language models","author":"Liu","year":"2024","journal-title":"arXiv:2401.02777"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657883"},{"key":"ref58","first-page":"60315","article-title":"Offline training of language model agents with functions as learnable weights","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","volume":"235","author":"Zhang"},{"key":"ref59","first-page":"1","article-title":"Toolformer: Language models can teach themselves to use tools","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Schick"},{"key":"ref60","first-page":"1","article-title":"Gpt4tools: Teaching large language model to use tools via self-instruction","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Yang"},{"key":"ref61","first-page":"371","article-title":"TPTU-V2: Boosting task planning and tool usage of large language model-based agents in real-world industry systems","volume-title":"Proc. Conf. Empirical Methods Nat. Lang. Process. Ind. Track","author":"Kong"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.2024.3389075"},{"key":"ref63","article-title":"CliqueParcel: An approach for batching LLM prompts that jointly optimizes efficiency and faithfulness","author":"Liu","year":"2024","journal-title":"arXiv:2402.14833"},{"key":"ref64","article-title":"NestFul: A benchmark for evaluating LLMs on nested sequences of API calls","author":"Basu","year":"2024","journal-title":"arXiv:2409.03797"},{"key":"ref65","first-page":"302","article-title":"Reverse chain: A generic-rule for LLMs to master multi-API planning","volume-title":"Proc. Findings Assoc. Comput. Linguist. (NAACL)","author":"Zhang"},{"key":"ref66","article-title":"Cache & distil: Optimizing API calls to large language models","author":"Ram\u00edrez","year":"2023","journal-title":"arXiv:2310.13561"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"issue":"140","key":"ref68","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"issue":"8","key":"ref69","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref70","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown"},{"key":"ref71","volume-title":"PanGu-\u03b1: Large-scale autoregressive pretrained Chinese language models with auto-parallel computation","author":"Zeng","year":"2021"},{"key":"ref72","article-title":"ERNIE 3.0: Large-scale knowledge enhanced pre-training for language understanding and generation","author":"Sun","year":"2021","journal-title":"arXiv:2107.02137"},{"key":"ref73","article-title":"Training compute-optimal large language models","author":"Hoffmann","year":"2022","journal-title":"arXiv:2203.15556"},{"key":"ref74","article-title":"Lamda: Language models for dialog applications","author":"Thoppilan","year":"2022","journal-title":"arXiv:2201.08239"},{"key":"ref75","article-title":"Using deepspeed and megatron to train megatron-turing NLG 530b, a large-scale generative language model","author":"Smith","year":"2022","journal-title":"arXiv:2201.11990"},{"key":"ref76","article-title":"Scaling language models: Methods, analysis & insights from training gopher","author":"Rae","year":"2021","journal-title":"arXiv:2112.11446"},{"key":"ref77","article-title":"OPT: Open pre-trained transformer language models","author":"Zhang","year":"2022","journal-title":"arXiv:2205.01068"},{"key":"ref78","article-title":"Galactica: A large language model for science","author":"Taylor","year":"2022","journal-title":"arXiv:2211.09085"},{"key":"ref79","article-title":"BLOOM: A 176B-parameter open-access multilingual language model","author":"Scao","year":"2022","journal-title":"arXiv:2211.05100"},{"key":"ref80","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv:2302.13971"},{"key":"ref81","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"key":"ref82","volume-title":"The llama 3 herd of models","author":"Grattafiori","year":"2024"},{"key":"ref83","volume-title":"Baichuan-7B: About: A large-scale 7B pretraining language model developed by BaiChuan-Inc","year":"2024"},{"key":"ref84","volume-title":"A 13B large language model developed by Baichuan intelligent technology","year":"2024"},{"key":"ref85","article-title":"Qwen technical report","volume-title":"arXiv:2309.16609","author":"Bai","year":"2023"},{"key":"ref86","volume-title":"Training-free long-context scaling of large language models","author":"An","year":"2024"},{"key":"ref87","article-title":"Qwen2 technical report","author":"Yang","year":"2024"},{"key":"ref88","article-title":"Qwen2.5 technical report","author":"Qwen","year":"2025"},{"key":"ref89","article-title":"Skywork: A more open bilingual foundation model","author":"Wei","year":"2023","journal-title":"arXiv:2310.19341"},{"key":"ref90","article-title":"The falcon series of open language models","author":"Almazrouei","year":"2023","journal-title":"arXiv:2311.16867"},{"key":"ref91","article-title":"Starcoder: May the source be with you!","author":"Li","year":"2023","journal-title":"arXiv:2305.06161"},{"key":"ref92","volume-title":"Finetuned language models are zero-shot learners","author":"Wei","year":"2022"},{"key":"ref93","article-title":"mT5: A massively multilingual pre-trained text-to-text transformer","author":"Xue","year":"2020","journal-title":"arXiv:2010.11934"},{"key":"ref94","volume-title":"Flan-MOE: Scaling instruction-finetuned language models with sparse mixture of experts","author":"Shen","year":"2023"},{"key":"ref95","volume-title":"Scaling instruction-finetuned language models","author":"Chung","year":"2022"},{"key":"ref96","volume-title":"Alpaca","author":"Taori","year":"2023"},{"key":"ref97","volume-title":"Training language models to self-correct via reinforcement learning","author":"Kumar","year":"2024"},{"key":"ref98","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wei"},{"key":"ref99","article-title":"DeepSeek-V3 technical report","author":"DeepSeek-AI","year":"2024"},{"key":"ref100","article-title":"Deepseekmath: Pushing the limits of mathematical reasoning in open language models","author":"Shao","year":"2024","journal-title":"arXiv:2402.03300"},{"key":"ref101","first-page":"95266","article-title":"MMLU-pro: A more robust and challenging multi-task language understanding benchmark","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"37","author":"Wang"},{"key":"ref102","article-title":"GPQA: A graduate-level Google-proof Q&A benchmark","author":"Rein","year":"2023","journal-title":"arXiv:2311.12022"},{"key":"ref103","volume-title":"Introducing SWE-bench verified","author":"Chowdhury","year":"2025"},{"key":"ref104","volume-title":"QwQ-32B: Embracing the power of reinforcement learning","author":"Team","year":"2025"},{"key":"ref105","volume-title":"Exploring large language model based intelligent agents: Definitions, methods, and prospects","author":"Cheng","year":"2024"},{"key":"ref106","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Alayrac"},{"key":"ref107","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref108","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv:2304.10592"},{"key":"ref109","article-title":"CogVLM: Visual expert for pretrained language models","author":"Wang","year":"2023","journal-title":"arXiv:2311.03079"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref111","article-title":"PandaGPT: One model to instruction-follow them all","author":"Su","year":"2023","journal-title":"arXiv:2305.16355"},{"key":"ref112","article-title":"Next-GPT: Any-to-any multimodal LLM","author":"Wu","year":"2023","journal-title":"arXiv:2309.05519"},{"key":"ref113","article-title":"OneLLM: One framework to align all modalities with language","author":"Han","year":"2023","journal-title":"arXiv:2312.03700"},{"key":"ref114","article-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv:2312.11805"},{"key":"ref115","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Reid","year":"2024","journal-title":"arXiv:2403.05530"},{"key":"ref116","article-title":"Qwen2.5-VL technical report","author":"Bai","year":"2025"},{"key":"ref117","article-title":"Shikra: Unleashing multimodal LLM\u2019s referential dialogue magic","author":"Chen","year":"2023","journal-title":"arXiv:2306.15195"},{"key":"ref118","article-title":"Ferret: Refer and ground anything anywhere at any granularity","author":"You","year":"2023","journal-title":"arXiv:2310.07704"},{"key":"ref119","volume-title":"Pix2seq: A language modeling framework for object detection","author":"Chen","year":"2022"},{"key":"ref120","article-title":"Next-chat: An LMM for chat, detection and segmentation","author":"Zhang","year":"2023","journal-title":"arXiv:2311.04498"},{"key":"ref121","volume-title":"Llama-4 multimodal intelligence","year":"2025"},{"key":"ref122","article-title":"A simple and effective pruning approach for large language models","author":"Sun","year":"2023","journal-title":"arXiv:2306.11695"},{"key":"ref123","first-page":"1","article-title":"LLM-pruner: On the structural pruning of large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Ma"},{"key":"ref124","volume-title":"LoRAPrune: Structured pruning meets low-rank parameter-efficient fine-tuning","author":"Zhang","year":"2024"},{"key":"ref125","article-title":"LoRaShear: Efficient large language model structured pruning and knowledge recovery","author":"Chen","year":"2023","journal-title":"arXiv:2310.18356"},{"key":"ref126","article-title":"Fluctuation-based adaptive structured pruning for large language models","author":"An","year":"2023","journal-title":"arXiv:2312.11983"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445737"},{"key":"ref128","article-title":"Compresso: Structured pruning with collaborative prompting learns compact large language models","author":"Guo","year":"2023","journal-title":"arXiv:2310.05015"},{"key":"ref129","article-title":"Sheared llama: Accelerating language model pre-training via structured pruning","author":"Xia","year":"2023","journal-title":"arXiv:2310.06694"},{"key":"ref130","article-title":"Pruning large language models via accuracy predictor","author":"Ji","year":"2023","journal-title":"arXiv:2309.09507"},{"key":"ref131","article-title":"Beyond size: How gradients shape pruning decisions in large language models","author":"Das","year":"2023","journal-title":"arXiv:2311.04902"},{"key":"ref132","first-page":"1","article-title":"ZipLM: Inference-aware structured pruning of language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Kurti\u0107"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i21.34366"},{"key":"ref134","first-page":"38087","article-title":"SmoothQuant: Accurate and efficient post-training quantization for large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Xiao"},{"key":"ref135","article-title":"RPTQ: Reorder-based post-training quantization for large language models","author":"Yuan","year":"2023","journal-title":"arXiv:2304.01089"},{"key":"ref136","article-title":"LoFTQ: LoRa-fine-tuning-aware quantization for large language models","author":"Li","year":"2023","journal-title":"arXiv:2310.08659"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.102"},{"key":"ref138","article-title":"FPTQ: Fine-grained post-training quantization for large language models","author":"Li","year":"2023","journal-title":"arXiv:2308.15987"},{"key":"ref139","article-title":"OWQ: Lessons learned from activation outliers for weight quantization in large language models","author":"Lee","year":"2023","journal-title":"arXiv:2306.02272"},{"key":"ref140","first-page":"87","article-title":"AWQ: Activation-aware weight quantization for on-device LLM compression and acceleration","volume-title":"Proc. Mach. Learn. Syst.","volume":"6","author":"Lin"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/ICME57554.2024.10688089"},{"key":"ref142","article-title":"OmniQuant: Omnidirectionally calibrated quantization for large language models","author":"Shao","year":"2023","journal-title":"arXiv:2308.13137"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.460"},{"key":"ref144","first-page":"36187","article-title":"Memory-efficient fine-tuning of compressed large language models via sub-4-bit integer quantization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Kim"},{"key":"ref145","article-title":"QLLM: Accurate and efficient low-bitwidth quantization for large language models","author":"Liu","year":"2023","journal-title":"arXiv:2310.08041"},{"key":"ref146","article-title":"OstQuant: Refining large language model Quantization with orthogonal and scaling transformations for better distribution fitting","author":"Hu","year":"2025","journal-title":"arXiv:2501.13987"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.507"},{"key":"ref148","article-title":"ZePHYR: Direct distillation of LM alignment","author":"Tunstall","year":"2023","journal-title":"arXiv:2310.16944"},{"key":"ref149","article-title":"LION: Adversarial distillation of closed-source large language model","author":"Jiang","year":"2023","journal-title":"arXiv:2305.12870"},{"key":"ref150","article-title":"PAD: Program-aided distillation specializes large models in reasoning","author":"Zhu","year":"2023","journal-title":"arXiv:2305.13888"},{"key":"ref151","article-title":"DistillSpec: Improving speculative decoding via knowledge distillation","author":"Zhou","year":"2023","journal-title":"arXiv:2310.08461"},{"key":"ref152","first-page":"20852","article-title":"Less is more: Task-aware layer-wise distillation for language model compression","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Liang"},{"key":"ref153","article-title":"HomoDistil: Homotopic task-agnostic distillation of pre-trained transformers","author":"Liang","year":"2023","journal-title":"arXiv:2302.09632"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.150"},{"key":"ref155","article-title":"Evolving knowledge distillation with large language models and active learning","author":"Liu","year":"2024","journal-title":"arXiv:2403.06414"},{"key":"ref156","first-page":"1378","article-title":"Minimal distillation schedule for extreme language model compression","volume-title":"Proc. Findings Assoc. Comput. Linguist. (EACL)","author":"Zhang"},{"key":"ref157","first-page":"1","article-title":"SLAM: Student-label mixing for distillation with unlabeled examples","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Kontonis"},{"key":"ref158","article-title":"Universalner: Targeted distillation from large language models for open named entity recognition","author":"Zhou","year":"2023","journal-title":"arXiv:2308.03279"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.304"},{"key":"ref160","article-title":"Contextualization distillation from large language model for knowledge graph completion","author":"Li","year":"2024","journal-title":"arXiv:2402.01729"},{"key":"ref161","first-page":"397","article-title":"{INFaaS}: Automated model-less inference serving","volume-title":"Proc. USENIX Annu. Tech. Conf. (USENIX ATC)","author":"Romero"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2022.3189186"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.14778\/3570690.3570692"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575698"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587438"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.406"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1145\/3565020"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1186\/s12911-020-01191-1"},{"key":"ref169","first-page":"1","article-title":"Fast inference from transformers via speculative decoding","volume-title":"Proc. 40th Int. Conf. Mach. Learn. (ICML)","author":"Leviathan"},{"key":"ref170","article-title":"LLMCAD: Fast and scalable on-device large language model inference","author":"Xu","year":"2023","journal-title":"arXiv:2309.04255"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"ref172","first-page":"1","article-title":"Sequoia: Scalable and robust speculative decoding","volume-title":"Proc. 38th Annu. Conf. Neural Inf. Process. Syst.","author":"Chen"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1145\/3530811"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00085"},{"key":"ref175","first-page":"1","article-title":"Token merging: Your ViT but faster","volume-title":"Proc. 11th Int. Conf. Learn. Rep.","author":"Bolya"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.391"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.825"},{"key":"ref178","first-page":"1","article-title":"H2O: Heavy-hitter oracle for efficient generative inference of large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Zhang"},{"key":"ref179","first-page":"1","article-title":"Model tells you what to discard: Adaptive KV cache compression for LLMs","volume-title":"Proc. 12th Int. Conf. Learn. Rep.","author":"Ge"},{"key":"ref180","first-page":"1","article-title":"Efficient streaming language models with attention sinks","volume-title":"Proc. 12th Int. Conf. Learn. Rep.","author":"Xiao"},{"key":"ref181","first-page":"1","article-title":"ThinK: Thinner key cache by query-driven pruning","volume-title":"Proc. 13th Int. Conf. Learn. Rep.","author":"Xu"},{"key":"ref182","first-page":"1","article-title":"DuoAttention: Efficient long-context LLM inference with retrieval and streaming heads","volume-title":"Proc. 13th Int. Conf. Learn. Rep.","author":"Xiao"},{"key":"ref183","first-page":"5","article-title":"MagicPIG: LSH sampling for efficient LLM generation","volume-title":"Proc. 13th Int. Conf. Learn. Rep.","author":"Chen"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.232"},{"key":"ref185","first-page":"1","article-title":"In-context autoencoder for context compression in a large language model","volume-title":"Proc. 12th Int. Conf. Learn. Rep.","author":"Ge"},{"key":"ref186","first-page":"19327","article-title":"Learning to compress prompts with gist tokens","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Mu"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071047"},{"key":"ref188","first-page":"1","article-title":"Synergistic patch pruning for vision transformer: Unifying intra-& inter-layer patch importance","volume-title":"Proc. 12th Int. Conf. Learn. Rep.","author":"Zhang"},{"key":"ref189","first-page":"9","article-title":"EViT: Expediting vision transformers via token reorganizations","volume-title":"Proc. Int. Conf. Learn. Rep.","author":"Liang"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00996"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00208"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73004-7_2"},{"key":"ref193","first-page":"4065","article-title":"LOOK-M: Look-once optimization in KV cache for efficient multimodal long-context inference","volume-title":"Proc. Findings Assoc. Comput. Linguist. (EMNLP)","author":"Wan"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM52122.2024.10621087"},{"key":"ref195","first-page":"38971","article-title":"Adaptive computation with elastic input sequence","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Xue"},{"key":"ref196","first-page":"613","article-title":"Clipper: A low-latency online prediction serving system","volume-title":"Proc. 14th USENIX Symp. Netw. Syst. Design Implement. (NSDI)","author":"Crankshaw"},{"key":"ref197","first-page":"1049","article-title":"MArk: Exploiting cloud services for cost-effective, SLO-aware machine learning inference serving","volume-title":"Proc. USENIX Annu. Tech. Conf. (USENIX ATC)","author":"Zhang"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421285"},{"key":"ref200","first-page":"443","article-title":"Serving DNNs like clockwork: Performance predictability from the bottom up","volume-title":"Proc. 14th USENIX Symp. Oper. Syst. Design Implement. (OSDI)","author":"Gujarati"},{"key":"ref201","first-page":"1041","article-title":"CockTail: A multidimensional optimization for model serving in cloud","volume-title":"Proc. 19th USENIX Symp. Netw. Syst. Design Implement. (NSDI)","author":"Gunasekaran"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1145\/3588195.3592997"},{"key":"ref203","first-page":"787","article-title":"SHEPHERD: Serving DNNs in the wild","volume-title":"Proc. 20th USENIX Symp. Netw. Syst. Design Implement. (NSDI)","author":"Zhang"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2020.05.057"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2020.2986615"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.2994308"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/TII.2020.2978946"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2024.3415661"},{"key":"ref210","article-title":"Serving long-context LLMs at the mobile edge: Test-time reinforcement learning-based model caching and inference offloading","author":"Xu","year":"2025","journal-title":"arXiv:2501.14205"},{"key":"ref211","first-page":"135","article-title":"ServerlessLLM: Low-latency serverless inference for large language models","volume-title":"Proc. 18th USENIX Symp. Oper. Syst. Design Implement. (OSDI)","author":"Fu"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM52122.2024.10621342"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695948"},{"key":"ref214","volume-title":"NVIDIA triton inference server","year":"2024"},{"key":"ref215","volume-title":"TensorFlow serving","year":"2024"},{"key":"ref216","article-title":"MegaTron-LM: Training multi-billion parameter language models using model parallelism","author":"Shoeybi","year":"2019","journal-title":"arXiv:1909.08053"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"ref218","first-page":"1","article-title":"LightSeq:: Sequence level parallelism for distributed training of long context transformers","volume-title":"Proc. Workshop Adv. Neural Netw. Training Comput. Efficiency Scalability Resource Optim. (WANT@NeurIPS)","author":"Li"},{"key":"ref219","first-page":"12312","article-title":"Distributed inference and fine-tuning of large language models over the Internet","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Borzunov"},{"key":"ref220","first-page":"1","article-title":"Taming throughput-latency tradeoff in LLM inference with Sarathi-serve","volume-title":"Proc. 18th USENIX Conf. Oper. Syst. Design Implement. (OSDI)","author":"Agrawal"},{"key":"ref221","first-page":"193","article-title":"DistServe: Disaggregating prefill and decoding for goodput-optimized large language model serving","volume-title":"Proc. 18th USENIX Conf. Oper. Syst. Design Implement. (OSDI)","author":"Zhong"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3672042"},{"key":"ref223","first-page":"1","article-title":"HEXGEN: Generative inference of large language model over heterogeneous environment","volume-title":"Proc. 41st Int. Conf. Mach. Learn. (ICML)","author":"Jiang"},{"key":"ref224","first-page":"1","article-title":"Helix: Distributed serving of large language models via max-flow on heterogeneous GPUs","volume-title":"Proc. ASPLOS Conf.","author":"Mei"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645416"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1109\/SOCC49529.2020.9524802"},{"key":"ref227","first-page":"250","article-title":"MNNFast: A fast and scalable system architecture for memory-augmented neural networks","volume-title":"Proc. 46th Int. Symp. Comput. Architect.","author":"Jang"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM57271.2023.00049"},{"key":"ref229","article-title":"A cost-efficient FPGA implementation of tiny transformer model using neural ODE","author":"Okubo","year":"2024","journal-title":"arXiv:2401.02721"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.1145\/3626202.3637562"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1109\/hpca47549.2020.00035"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00060"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480125"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2022.3170848"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD50377.2020.00047"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.23919\/DATE51398.2021.9474146"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/RTAS58335.2023.00036"},{"key":"ref239","first-page":"1","article-title":"Simplifying transformer blocks","volume-title":"Proc. 12th Int. Conf. Learn. Rep.","author":"He"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC55918.2022.00018"},{"key":"ref241","article-title":"Inference with reference: Lossless acceleration of large language models","author":"Yang","year":"2023","journal-title":"arXiv:2304.04487"},{"key":"ref242","volume-title":"Exponentially faster language modeling","author":"Belcak","year":"2023"},{"key":"ref243","volume-title":"Efficient LLM inference on CPUs","author":"Shen","year":"2023"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695964"},{"key":"ref245","first-page":"162","article-title":"HeteGen: Efficient heterogeneous parallel inference for large language models on resource-constrained devices","volume-title":"Proc. Mach. Learn. Syst.","author":"Zhao"},{"key":"ref246","first-page":"31094","article-title":"FlexGEN: High-throughput generative inference of large language models with a single GPU","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Sheng"},{"key":"ref247","first-page":"1","article-title":"Deja Vu: Contextual sparsity for efficient LLMs at inference time","volume-title":"Proc. 40th Int. Conf. Mach. Learn. (ICML)","author":"Liu"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.678"},{"key":"ref249","article-title":"Fast transformer decoding: One write-head is all you need","author":"Shazeer","year":"2019","journal-title":"arXiv:1911.02150"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref252","first-page":"1","article-title":"FLASHATTENTION: Fast and memory-efficient exact attention with IO-awareness","volume-title":"Proc. 36th Int. Conf. Neural Inf. Process. Syst. (NIPS)","author":"Dao"},{"key":"ref253","first-page":"148","article-title":"FlashDecoding++: Faster large language model inference with asynchronization, flat GEMM optimization, and heuristics","volume-title":"Proc. Mach. Learn. Syst.","volume":"6","author":"Hong"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"ref255","article-title":"Fast distributed inference serving for large language models","author":"Wu","year":"2023","journal-title":"arXiv:2305.05920"},{"key":"ref256","first-page":"18332","article-title":"DeepSpeed-MoE: Advancing mixture-of-experts inference and training to power next-generation AI scale","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Rajbhandari"},{"key":"ref257","article-title":"EdgeMOE: Fast on-device inference of MOE-based large language models","author":"Yi","year":"2023","journal-title":"arXiv:2308.14352"},{"key":"ref258","article-title":"Fast inference of mixture-of-experts language models with offloading","author":"Eliseev","year":"2023","journal-title":"arXiv:2312.17238"},{"key":"ref259","article-title":"MoE-infinity: Activation-aware expert offloading for efficient MoE serving","author":"Xue","year":"2024","journal-title":"arXiv:2401.14361"},{"key":"ref260","first-page":"6","article-title":"Fiddler: CPU-GPU orchestration for fast inference of mixture-of-experts models","volume-title":"Proc. 13th Int. Conf. Learn. Rep.","author":"Kamahori"},{"key":"ref261","volume-title":"Ggerganov\/llama.cpp: Port of Facebook\u2019s llama model in C\/C++","author":"Gerganov","year":"2023"},{"key":"ref262","volume-title":"Distributed llama","author":"Tadych","year":"2024"},{"key":"ref263","article-title":"TPI-LLM: Serving 70B-scale LLMs efficiently on low-resource edge devices","author":"Li","year":"2024","journal-title":"arXiv:2410.00531"},{"key":"ref264","doi-asserted-by":"publisher","DOI":"10.1109\/MNET.2023.3321529"},{"key":"ref265","doi-asserted-by":"publisher","DOI":"10.1109\/VTC2024-Spring62846.2024.10683673"},{"key":"ref266","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2024.3524255"},{"key":"ref267","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671679"},{"key":"ref268","first-page":"160","article-title":"LinguaLinked: Distributed large language model inference on mobile devices","volume-title":"Proc. 62nd Annu. Meeting Assoc. Comput. Linguist.","author":"Zhao"},{"key":"ref269","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2022.3222000"},{"key":"ref270","doi-asserted-by":"publisher","DOI":"10.1109\/MNET.2024.3352031"},{"key":"ref271","first-page":"5","article-title":"Towards low-bit communication for tensor parallel LLM inference","volume-title":"Proc. NeurIPS Workshop","author":"Dong"},{"key":"ref272","article-title":"Communicating activations between language model agents","author":"Ramesh","year":"2025","journal-title":"arXiv:2501.14082"},{"key":"ref273","volume-title":"MLC-LLM: Efficient and portable deployment of large language models","year":"2023"},{"key":"ref274","volume-title":"MNN-LLM: LLM deploy project based MNN","year":"2023"},{"key":"ref275","first-page":"14","article-title":"Judging LLM-as-a-judge with MT-bench and Chatbot arena","volume-title":"Proc. 37th Conf. Neural Inf. Process. Syst. Datasets Benchmarks Track","author":"Zheng"},{"key":"ref276","first-page":"783","article-title":"OpenVINO deep learning workbench: Comprehensive analysis and tuning of neural networks inference","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis. Workshops","author":"Gorbachev"},{"key":"ref277","volume-title":"MLLM is a fast and lightweight multimodal LLM inference engine for mobile and edge devices","year":"2025"},{"key":"ref278","volume-title":"FP6-LLM: Efficiently serving large language models through FP6-centric algorithm-system co-design","author":"Xia","year":"2024"},{"key":"ref279","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605613"},{"key":"ref280","first-page":"1","article-title":"Efficient large-scale language model training on GPU clusters using megatron-LM","volume-title":"Proc. Int. Conf. High Perform. Comput. Netw. Storage Anal.","author":"Narayanan"},{"key":"ref281","volume-title":"A TensorRT toolbox for Optimized large language model inference","year":"2023"},{"key":"ref282","volume-title":"LangChain","year":"2024"},{"key":"ref283","first-page":"1","article-title":"SGLang: Efficient execution of structured language model programs","volume-title":"Proc. 38th Annu. Conf. Neural Inf. Process. Syst.","author":"Zheng"},{"key":"ref284","first-page":"1","article-title":"Speculative RAG: Enhancing retrieval augmented generation through drafting","volume-title":"Proc. 13th Int. Conf. Learn. Rep.","author":"Wang"},{"key":"ref285","article-title":"FastDecode: High-throughput GPU-efficient LLM serving using heterogeneous pipelines","author":"He","year":"2024","journal-title":"arXiv:2403.11421"}],"container-title":["IEEE Communications Surveys &amp; Tutorials"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/9739\/11321210\/11039635.pdf?arnumber=11039635","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T05:49:31Z","timestamp":1767332971000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11039635\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":285,"URL":"https:\/\/doi.org\/10.1109\/comst.2025.3580745","relation":{},"ISSN":["1553-877X","2373-745X"],"issn-type":[{"value":"1553-877X","type":"electronic"},{"value":"2373-745X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}