{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T19:05:56Z","timestamp":1765479956608,"version":"3.48.0"},"reference-count":36,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Ministry of Education AcRF Tier 2","award":["MOE-T2EP20224-0020"],"award-info":[{"award-number":["MOE-T2EP20224-0020"]}]},{"name":"Guangdong S&#x0026;T Program","award":["2024B0101040005"],"award-info":[{"award-number":["2024B0101040005"]}]},{"name":"Beijing Normal University","award":["28700-312200502503"],"award-info":[{"award-number":["28700-312200502503"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1109\/tpds.2025.3626974","type":"journal-article","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T17:13:36Z","timestamp":1761930816000},"page":"90-105","source":"Crossref","is-referenced-by-count":0,"title":["Efficient KV Cache Spillover Management on Memory-Constrained GPU for LLM Inference"],"prefix":"10.1109","volume":"37","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1417-3012","authenticated-orcid":false,"given":"Jiazhi","family":"Jiang","sequence":"first","affiliation":[{"name":"Beijing Normal University, Zhuhai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5798-2282","authenticated-orcid":false,"given":"Yao","family":"Chen","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7992-426X","authenticated-orcid":false,"given":"Zining","family":"Zhang","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8618-4581","authenticated-orcid":false,"given":"Bingsheng","family":"He","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pingyi","family":"Luo","sequence":"additional","affiliation":[{"name":"4Paradigm Inc, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mian","family":"Lu","sequence":"additional","affiliation":[{"name":"4Paradigm Inc, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9339-2510","authenticated-orcid":false,"given":"Yuqiang","family":"Chen","sequence":"additional","affiliation":[{"name":"4Paradigm Inc, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9145-3919","authenticated-orcid":false,"given":"Hongbing","family":"Zhang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4707-9492","authenticated-orcid":false,"given":"Jiangsu","family":"Du","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5582-1031","authenticated-orcid":false,"given":"Dan","family":"Huang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5315-3375","authenticated-orcid":false,"given":"Yutong","family":"Lu","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"article-title":"Improving language understanding by generative pre-training","year":"2018","author":"Radford","key":"ref1"},{"article-title":"OPT: Open pre-trained transformer language models","year":"2022","author":"Zhang","key":"ref2"},{"article-title":"LLaMA: Open and efficient foundation language models","year":"2023","author":"Touvron","key":"ref3"},{"article-title":"LLaMA 2: Open foundation and fine-tuned chat models","year":"2023","author":"Touvron","key":"ref4"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.14778\/3626292.3626303"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3732941"},{"key":"ref8","first-page":"521","article-title":"Orca: A distributed serving system for transformer-based generative models","volume-title":"Proc. 16th USENIX Symp. Operating Syst. Des. Implementation","author":"Yu"},{"article-title":"Response length perception and sequence scheduling: An LLM-empowered LLM inference pipeline","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zheng","key":"ref9"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2023.3266110"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"year":"2025","key":"ref13","article-title":"4Paradigm. Wikipedia, The Free Encyclopedia"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/isca59077.2024.00019"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.14778\/3551793.3551828"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783721"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE53745.2022.00241"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378530"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2023.3269530"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2023.3280805"},{"key":"ref21","first-page":"606","article-title":"Efficiently scaling transformer inference","volume-title":"Proc. 6th Conf. Mach. Learn. Syst.","author":"Pope"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-industry.8"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695964"},{"key":"ref25","first-page":"16","article-title":"FlashAttention: Fast and memory-efficient exact attention with IO-awareness","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Dao"},{"article-title":"FlashAttention-2: Faster attention with better parallelism and work partitioning","year":"2023","author":"Dao","key":"ref26"},{"key":"ref27","article-title":"Flash-decoding for long-context inference","author":"Dao","year":"2023","journal-title":"PyTorch Blog3"},{"article-title":"FlashDecoding++: Faster large language model inference on GPUs","year":"2023","author":"Hong","key":"ref28"},{"key":"ref29","first-page":"31","article-title":"FlexGen: High-throughput generative inference of large language models with a single GPU","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Sheng"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.14778\/3489496.3489500"},{"article-title":"DistServe: Disaggregating prefill and decoding for goodput-optimized large language model serving","year":"2024","author":"Zhong","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.14778\/3570690.3570697"},{"article-title":"Model tells you what to discard: Adaptive KV cache compression for LLMs","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Ge","key":"ref33"},{"key":"ref34","first-page":"3304","article-title":"KV cache is 1 bit per channel: Efficient large language model inference with coupled quantization","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhang"},{"key":"ref35","first-page":"32","article-title":"KIVI: A tuning-free asymmetric 2bit quantization for KV cache","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Liu"},{"key":"ref36","first-page":"1270","article-title":"KVQuant: Towards 10 million context length LLM inference with KV cache quantization","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Hooper"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/71\/11261373\/11223035.pdf?arnumber=11223035","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T18:47:32Z","timestamp":1765478852000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11223035\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1]]},"references-count":36,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2025.3626974","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"type":"print","value":"1045-9219"},{"type":"electronic","value":"1558-2183"},{"type":"electronic","value":"2161-9883"}],"subject":[],"published":{"date-parts":[[2026,1]]}}}