{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,2]],"date-time":"2026-07-02T23:41:18Z","timestamp":1783035678219,"version":"3.54.6"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T00:00:00Z","timestamp":1769472000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2026,2,5]],"date-time":"2026-02-05T00:00:00Z","timestamp":1770249600000},"content-version":"vor","delay-in-days":9,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2025YFB4501405"],"award-info":[{"award-number":["2025YFB4501405"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Complex Intell. Syst."],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s40747-025-02200-4","type":"journal-article","created":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T05:53:12Z","timestamp":1769493192000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Multi-tier dynamic storage of KV cache for LLM inference under resource-constrained conditions"],"prefix":"10.1007","volume":"12","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-0248-008X","authenticated-orcid":false,"given":"Junliang","family":"Wang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiaqi","family":"Hu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qingping","family":"Cao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuanrui","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiancheng","family":"Lin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,1,27]]},"reference":[{"key":"2200_CR1","doi-asserted-by":"publisher","unstructured":"Cai F, Yuan D, Yang Z, et\u00a0al (2024) Edge-llm: a collaborative framework for large language model serving in edge computing. In: 2024 IEEE international conference on web services (ICWS). pp 799\u2013809. https:\/\/doi.org\/10.1109\/ICWS62655.2024.00099","DOI":"10.1109\/ICWS62655.2024.00099"},{"key":"2200_CR2","doi-asserted-by":"publisher","DOI":"10.1145\/3773075","author":"X Cao","year":"2025","unstructured":"Cao X, Xu M, Yu X et al (2025) Analytical survey of learning with low-resource data: from analysis to investigation. ACM Comput Surv. https:\/\/doi.org\/10.1145\/3773075","journal-title":"ACM Comput Surv"},{"key":"2200_CR3","doi-asserted-by":"publisher","DOI":"10.1145\/3676536.3676753","volume-title":"An agile framework for efficient LLM accelerator development and model inference","author":"L Chen","year":"2025","unstructured":"Chen L, Wu Y, Wen C et al (2025) An agile framework for efficient LLM accelerator development and model inference. Association for Computing Machinery, New York. https:\/\/doi.org\/10.1145\/3676536.3676753"},{"key":"2200_CR4","doi-asserted-by":"crossref","unstructured":"Chen Y, Wang G, Shang J, et\u00a0al (2024) NACL: a general and effective kv cache eviction framework for llms at inference time. In: Ku L, Martins A, Srikumar V (eds) Proceedings of the 62nd annual meeting of the association for computational linguistics, vol 1: long papers, 62nd annual meeting of the association-for-computational-linguistics (ACL)\/Student Research Workshop (SRW). Bangkok, pp 7913\u20137926","DOI":"10.18653\/v1\/2024.acl-long.428"},{"key":"2200_CR5","doi-asserted-by":"crossref","unstructured":"Chen Z, Luo Y, Qiu R, et al (2021) Semantics disentangling for generalized zeroshot learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE Computer Society, Los Alamitos, CA, USA, pp 8712\u20138720","DOI":"10.1109\/ICCV48922.2021.00859"},{"key":"2200_CR6","doi-asserted-by":"publisher","unstructured":"Chen Z, Zhang P, Li J, et\u00a0al (2023) Zero-shot learning by harnessing adversarial samples. In: Proceedings of the 31st ACM international conference on multimedia, MM \u201923. Association for Computing Machinery, New York, pp 4138\u20134146. https:\/\/doi.org\/10.1145\/3581783.3611823","DOI":"10.1145\/3581783.3611823"},{"key":"2200_CR7","unstructured":"Chen Z, Zhao Z, Guo J, et\u00a0al (2025) Svip: semantically contextualized visual patches for zero-shot learning. In: IEEE\/CVF international conference on computer vision (ICCV), IEEE Computer Society, Hawaii"},{"key":"2200_CR8","doi-asserted-by":"crossref","unstructured":"Disha DY, Mondal MNI (2023) Accelerate implementation of timsort algorithm using cuda. In: 2023 26th international conference on computer and information technology (ICCIT). IEEE, pp 1\u20135","DOI":"10.1109\/ICCIT60459.2023.10441404"},{"key":"2200_CR9","unstructured":"Dubey A, Jauhri A, Pandey A, et\u00a0al (2024) The llama 3 herd of models. arXiv e-prints pp arXiv-2407"},{"key":"2200_CR10","unstructured":"Feng Y, Lv J, Cao Y, et\u00a0al (2024) Ada-kv: optimizing kv cache eviction by adaptive budget allocation for efficient llm inference. arXiv:2407.11550"},{"key":"2200_CR11","unstructured":"Gao B, He Z, Sharma P, et\u00a0al (2024) Cost-efficient large language model serving for multi-turn conversations with CachedAttention. In: 2024 USENIX annual technical conference (USENIX ATC 24). pp 111\u2013126"},{"issue":"5","key":"2200_CR12","doi-asserted-by":"publisher","first-page":"3281","DOI":"10.1007\/s11831-021-09698-0","volume":"29","author":"FS Gharehchopogh","year":"2022","unstructured":"Gharehchopogh FS (2022) Advances in tree seed algorithm: a comprehensive survey. Arch Comput Methods Eng 29(5):3281\u20133304","journal-title":"Arch Comput Methods Eng"},{"issue":"7","key":"2200_CR13","doi-asserted-by":"publisher","first-page":"2195","DOI":"10.1109\/TC.2025.3558009","volume":"74","author":"Y Hu","year":"2025","unstructured":"Hu Y, Liu X, Yang G et al (2025) Tightllm: maximizing throughput for llm inference via adaptive offloading policy. IEEE Trans Comput 74(7):2195\u20132209. https:\/\/doi.org\/10.1109\/TC.2025.3558009","journal-title":"IEEE Trans Comput"},{"key":"2200_CR14","doi-asserted-by":"publisher","unstructured":"Kim H, Ryu J, Lee J (2024) Tccl: discovering better communication paths for pcie gpu clusters. In: Proceedings of the 29th ACM international conference on architectural support for programming languages and operating systems, vol 3, ASPLOS \u201924. Association for Computing Machinery, New York, pp 999\u20131015. https:\/\/doi.org\/10.1145\/3620666.3651362","DOI":"10.1145\/3620666.3651362"},{"issue":"10","key":"2200_CR15","doi-asserted-by":"publisher","first-page":"260","DOI":"10.1007\/s10462-024-10888-y","volume":"57","author":"P Kumar","year":"2024","unstructured":"Kumar P (2024) Large language models (llms): survey, technical frameworks, and future challenges. Artif Intell Rev 57(10):260. https:\/\/doi.org\/10.1007\/s10462-024-10888-y","journal-title":"Artif Intell Rev"},{"key":"2200_CR16","doi-asserted-by":"crossref","unstructured":"Kwon W, Li Z, Zhuang S, et al (2023) Efficient memory management for large language model serving with pagedattention. In: Proceedings of the 29th symposium on operating systems principles. Association for Computing Machinery, New York, NY, USA, pp 611\u2013626","DOI":"10.1145\/3600006.3613165"},{"key":"2200_CR17","doi-asserted-by":"crossref","unstructured":"Lee H, Kim K, Kim J, et al (2025) Disk-based shared kv cache management for fast inference in multi-instance llm rag systems. In: 2025 IEEE 18th International Conference on Cloud Computing (CLOUD). IEEE, Helsinki, Finland, pp 199\u2013209. https:\/\/doi.org\/10.1109\/CLOUD67622.2025.00029","DOI":"10.1109\/CLOUD67622.2025.00029"},{"key":"2200_CR18","unstructured":"Lee W, Lee J, Seo J et al (2024) InfiniGen: efficient generative inference of large language models with dynamic KV cache management. 18th USENIX symposium on operating systems design and implementation (OSDI 24). USENIX Association, Santa Clara, pp 155\u2013172"},{"key":"2200_CR19","unstructured":"Li J, Jiang Y, Zhu Y, et al (2023) Accelerating distributed MoE training and inference with lina. In: 2023 USENIX Annual Technical Conference (USENIX ATC 23). USENIX Association, Boston, MA, pp 945\u2013959"},{"key":"2200_CR20","doi-asserted-by":"crossref","unstructured":"Liao B, Vargas DV (2024) Beyond kv caching: shared attention for efficient llms. arXiv:2407.12866","DOI":"10.2139\/ssrn.5242694"},{"key":"2200_CR21","doi-asserted-by":"crossref","unstructured":"Liu Y, Li H, Cheng Y, et al (2024) Cachegen: KV cache compression and streaming for fast large language model serving. In: Proceedings of the ACM SIGCOMM 2024 Conference. Association for Computing Machinery, New York, NY, USA, pp 38\u201356","DOI":"10.1145\/3651890.3672274"},{"issue":"2","key":"2200_CR22","doi-asserted-by":"publisher","first-page":"217","DOI":"10.1109\/JETCAS.2025.3568716","volume":"15","author":"A Moradifirouzabadi","year":"2025","unstructured":"Moradifirouzabadi A, Kang M (2025) End-to-end acceleration of generative models with runtime regularized kv cache management. IEEE J Emerg Sel Top Circuits Syst 15(2):217\u2013230. https:\/\/doi.org\/10.1109\/JETCAS.2025.3568716","journal-title":"IEEE J Emerg Sel Top Circuits Syst"},{"key":"2200_CR23","unstructured":"NVIDIA (2022) NVIDIA A10 gpu accelerator-Product brief. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/a10\/pdf\/A10-Product-Brief.pdf"},{"key":"2200_CR24","doi-asserted-by":"publisher","unstructured":"Oh H, Kim K, Kim J, et\u00a0al (2024) Exegpt: constraint-aware resource scheduling for llm inference. In: Proceedings of the 29th ACM international conference on architectural support for programming languages and operating systems, vol 2, ASPLOS \u201924. Association for Computing Machinery, New York, pp 369\u2013384. https:\/\/doi.org\/10.1145\/3620665.3640383","DOI":"10.1145\/3620665.3640383"},{"key":"2200_CR25","unstructured":"OpenAI (2024) Introducing ChatGPT search. https:\/\/openai.com\/index\/introducing-chatgpt-search\/"},{"key":"2200_CR26","doi-asserted-by":"crossref","unstructured":"Pan X, Li E, Li Q, et\u00a0al (2025) InstAttention: in-storage attention offloading for cost-effective long-context LLM inference. In: 2025 IEEE international symposium on high performance computer architecture (HPCA). IEEE, pp 1510\u20131525","DOI":"10.1109\/HPCA61900.2025.00113"},{"key":"2200_CR27","doi-asserted-by":"crossref","unstructured":"Papaioannou K, Dimitra\u00a0Doudali T (2024) Improving the efficiency of llm inference serving systems. European conference on parallel processing. Springer, pp 342\u2013347","DOI":"10.1007\/978-3-031-90203-1_39"},{"key":"2200_CR28","unstructured":"Qin R, Li Z, He W, et\u00a0al (2025) Mooncake: trading more storage for less computation\u2014a KVCache-centric architecture for serving LLM chatbot. In: 23rd USENIX conference on file and storage technologies (FAST 25). pp 155\u2013170"},{"key":"2200_CR29","doi-asserted-by":"publisher","unstructured":"Ren Z, Doekemeijer K, De\u00a0Matteis T, et\u00a0al (2025) An i\/o characterizing study of offloading llm models and kv caches to nvme ssd. In: CHEOPS \u201825: Proceedings of the 5th workshop on challenges and opportunities of efficient and performant storage systems, SIGOPS, euroSys \u201825: Twentieth European conference on computer systems. Rotterdam, Netherlands, pp 23\u201333. https:\/\/doi.org\/10.1145\/3719330.3721230","DOI":"10.1145\/3719330.3721230"},{"key":"2200_CR30","unstructured":"Ren Z, Li Y, Wang Z, et al (2025) Enabling efficient GPU communication over multiple NICs with FuseLink. In: 19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 25). USENIX Association, Boston, MA, pp 91\u2013108"},{"key":"2200_CR31","unstructured":"Shibing624 (2023) Random dataset based on wikipedia content. https:\/\/github.com\/theripnono\/get-random-wikipedia-content"},{"key":"2200_CR32","unstructured":"Shibing624 (2024) ShareGPT dataset. https:\/\/huggingface.co\/datasets\/shibing624\/sharegpt_gpt4"},{"issue":"4","key":"2200_CR33","doi-asserted-by":"publisher","first-page":"613","DOI":"10.1007\/s11227-025-07118-9","volume":"81","author":"S Shrestha","year":"2025","unstructured":"Shrestha S, Gautam A, Reddy N (2025) Storage access optimization for efficient gpu-centric information retrieval. J Supercomput 81(4):613","journal-title":"J Supercomput"},{"key":"2200_CR34","doi-asserted-by":"crossref","unstructured":"Song Y, Mi Z, Xie H, et al (2024) Powerinfer: Fast large language model serving with a consumer-grade gpu. In: Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles. Association for Computing Machinery, New York, NY, USA, pp 590\u2013606","DOI":"10.1145\/3694715.3695964"},{"key":"2200_CR35","doi-asserted-by":"publisher","unstructured":"Vijaya\u00a0Kumar A, Antichi G, Singh R (2025) Aqua: network-accelerated memory offloading for LLMs in scale-Up GPU domains. Association for Computing Machinery, New York, pp 48\u201362. https:\/\/doi.org\/10.1145\/3676641.3715983","DOI":"10.1145\/3676641.3715983"},{"key":"2200_CR36","doi-asserted-by":"publisher","unstructured":"Wang Y, Chen K, Tan H, et\u00a0al (2023) Tabi: an efficient multi-level inference system for large language models. In: Proceedings of the eighteenth European conference on computer systems, EuroSys \u201923. Association for Computing Machinery, New York, pp 233\u2013248. https:\/\/doi.org\/10.1145\/3552326.3587438","DOI":"10.1145\/3552326.3587438"},{"key":"2200_CR37","doi-asserted-by":"crossref","unstructured":"Wei T, Yu X, Chen Z, et\u00a0al (2025) Augment to segment: tackling pixel-level imbalance in wheat disease and pest segmentation. arXiv:2509.09961","DOI":"10.1007\/978-981-95-6196-4_3"},{"key":"2200_CR38","unstructured":"Wu B, Zhong Y, Zhang Z, et\u00a0al (2023) Fast distributed inference serving for large language models. arXiv:2305.05920"},{"key":"2200_CR39","first-page":"119638","volume-title":"Advances in neural information processing systems","author":"C Xiao","year":"2024","unstructured":"Xiao C, Zhang P, Han X et al (2024) Infllm: training-free long-context extrapolation for llms with an efficient context memory. In: Globerson A, Mackey L, Belgrave D (eds) Advances in neural information processing systems, vol 37. Curran Associates, Inc., New York, pp 119638\u2013119661"},{"key":"2200_CR40","doi-asserted-by":"crossref","unstructured":"Ye J, Cernuda J, Maurya A, et al (2025) Characterizing the behavior and impact of kv caching on transformer inferences under concurrency. In: 2025 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, Milano, Italy, pp 1191\u20131202","DOI":"10.1109\/IPDPS64566.2025.00108"},{"key":"2200_CR41","doi-asserted-by":"publisher","unstructured":"You F, Li J, Zhu L, et\u00a0al (2021) Domain adaptive semantic segmentation without source data. In: Proceedings of the 29th ACM international conference on multimedia, MM \u201921. Association for Computing Machinery, New York, pp 3293\u20133302. https:\/\/doi.org\/10.1145\/3474085.3475482","DOI":"10.1145\/3474085.3475482"},{"issue":"10","key":"2200_CR42","doi-asserted-by":"publisher","first-page":"13119","DOI":"10.1109\/JIOT.2024.3524255","volume":"12","author":"M Zhang","year":"2025","unstructured":"Zhang M, Shen X, Cao J et al (2025) Edgeshard: efficient llm inference via collaborative edge computing. IEEE Internet Things J 12(10):13119\u201313131. https:\/\/doi.org\/10.1109\/JIOT.2024.3524255","journal-title":"IEEE Internet Things J"},{"key":"2200_CR43","doi-asserted-by":"publisher","unstructured":"Zhao Z, Chen Z, Huang Z, et\u00a0al (2025) Continual text-to-video retrieval with frame fusion and task-aware routing. In: Proceedings of the 48th international ACM SIGIR conference on research and development in information retrieval, SIGIR \u201925. Association for Computing Machinery, New York, pp 1011\u20131021. https:\/\/doi.org\/10.1145\/3726302.3729936","DOI":"10.1145\/3726302.3729936"},{"key":"2200_CR44","doi-asserted-by":"publisher","DOI":"10.1145\/3719664","author":"Y Zheng","year":"2025","unstructured":"Zheng Y, Chen Y, Qian B et al (2025) A review on edge large language models: design, execution, and applications. ACM Comput Surv. https:\/\/doi.org\/10.1145\/3719664","journal-title":"ACM Comput Surv"},{"key":"2200_CR45","unstructured":"Zhu Y, Tang Z, Liu X, et al (2025) OracleKV: Oracle guidance for questionindependent KV cache compression. In: ICML 2025 Workshop on Long-Context Foundation Models, Vancouver, CA. https:\/\/openreview.net\/forum?id=KHM2YOGgX9"}],"container-title":["Complex &amp; Intelligent Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s40747-025-02200-4","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40747-025-02200-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40747-025-02200-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T07:56:43Z","timestamp":1774425403000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s40747-025-02200-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,27]]},"references-count":45,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["2200"],"URL":"https:\/\/doi.org\/10.1007\/s40747-025-02200-4","relation":{},"ISSN":["2199-4536","2198-6053"],"issn-type":[{"value":"2199-4536","type":"print"},{"value":"2198-6053","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,27]]},"assertion":[{"value":"28 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 November 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 January 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}],"article-number":"104"}}