{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,6]],"date-time":"2026-04-06T20:47:16Z","timestamp":1775508436019,"version":"3.50.1"},"reference-count":21,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,22]],"date-time":"2025-06-22T00:00:00Z","timestamp":1750550400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,22]],"date-time":"2025-06-22T00:00:00Z","timestamp":1750550400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,22]]},"DOI":"10.1109\/dac63849.2025.11132542","type":"proceedings-article","created":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T17:35:41Z","timestamp":1757957741000},"page":"1-7","source":"Crossref","is-referenced-by-count":0,"title":["KVO-LLM: Boosting Long-Context Generation Throughput for Batched LLM Inference"],"prefix":"10.1109","author":[{"given":"Zhenyu","family":"Li","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University,State Key Laboratory of Micro\/Nano Engineering Science, School of Electronic Information and Electrical Engineering,Shanghai,China"}]},{"given":"Dongxu","family":"Lyu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,State Key Laboratory of Micro\/Nano Engineering Science, School of Electronic Information and Electrical Engineering,Shanghai,China"}]},{"given":"Gang","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,State Key Laboratory of Micro\/Nano Engineering Science, School of Electronic Information and Electrical Engineering,Shanghai,China"}]},{"given":"Yuzhou","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,State Key Laboratory of Micro\/Nano Engineering Science, School of Electronic Information and Electrical Engineering,Shanghai,China"}]},{"given":"Liyan","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,State Key Laboratory of Micro\/Nano Engineering Science, School of Electronic Information and Electrical Engineering,Shanghai,China"}]},{"given":"Wenjie","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,State Key Laboratory of Micro\/Nano Engineering Science, School of Electronic Information and Electrical Engineering,Shanghai,China"}]},{"given":"Jianfei","family":"Jiang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,State Key Laboratory of Micro\/Nano Engineering Science, School of Electronic Information and Electrical Engineering,Shanghai,China"}]},{"given":"Yanan","family":"Sun","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,State Key Laboratory of Micro\/Nano Engineering Science, School of Electronic Information and Electrical Engineering,Shanghai,China"}]},{"given":"Guanghui","family":"He","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,State Key Laboratory of Micro\/Nano Engineering Science, School of Electronic Information and Electrical Engineering,Shanghai,China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"A Survey on Efficient Inference for Large Language Models","volume":"abs\/2404.14294","author":"Zhou","year":"2024","journal-title":"CoRR"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651380"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640422"},{"key":"ref4","first-page":"155","article-title":"InfiniGen: Efficient Generative Inference of Large Language Models with Dynamic KV Cache Management","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2024","author":"Lee"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/isca59077.2024.00077"},{"key":"ref6","article-title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","author":"Touvron","year":"2023"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/isca59077.2024.00080"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3649329.3657323"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/isca59077.2024.00079"},{"key":"ref10","article-title":"KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache","volume-title":"Forty-first International Conference on Machine Learning, ICML 2024","author":"Liu"},{"key":"ref11","article-title":"Q-Hitter: A Better Token Oracle for Efficient LLM Inference via Sparse-Quantized KV Cache","volume-title":"Proceedings of the Seventh Annual Conference on Machine Learning and Systems, MLSys 2024","author":"Zhang"},{"key":"ref12","article-title":"Efficient Streaming Language Models with Attention Sinks","volume-title":"The Twelfth International Conference on Learning Representations, ICLR 2024","author":"Xiao"},{"key":"ref13","article-title":"H2O: HeavyHitter Oracle for Efficient Generative Inference of Large Language Models","volume-title":"Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023","author":"Zhang"},{"key":"ref14","article-title":"QUEST: Query-Aware Sparsity for Efficient Long-Context LLM Inference","volume-title":"Forty-first International Conference on Machine Learning, ICML 2024","author":"Tang"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.172"},{"key":"ref16","article-title":"How Long Can Open-Source LLMs Truly Promise on Context Length?","author":"Li","year":"2023"},{"key":"ref17","article-title":"Vicuna: An Open-Source Chatbot Impressing GPT4 with 90% Chatgpt Quality","author":"Chiang","year":"2023"},{"key":"ref18","article-title":"Automatic differentiation in PyTorch","author":"Paszke","year":"2017"},{"key":"ref19","first-page":"38","article-title":"Transformers: State-of-the-Art Natural Language Processing","author":"Wolf","year":"2020"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/micro.2007.33"},{"key":"ref21","first-page":"41","article-title":"Fine-Grained DRAM: Energy-Efficient DRAM for Extreme Bandwidth Systems","volume-title":"Proceedings of the 50th Annual IEEE\/ACM International Symposium on Microarchitecture, ser. MICRO50\u201917","author":"O\u2019Connor"}],"event":{"name":"2025 62nd ACM\/IEEE Design Automation Conference (DAC)","location":"San Francisco, CA, USA","start":{"date-parts":[[2025,6,22]]},"end":{"date-parts":[[2025,6,25]]}},"container-title":["2025 62nd ACM\/IEEE Design Automation Conference (DAC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11132383\/11132091\/11132542.pdf?arnumber=11132542","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,6]],"date-time":"2026-04-06T19:52:55Z","timestamp":1775505175000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11132542\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,22]]},"references-count":21,"URL":"https:\/\/doi.org\/10.1109\/dac63849.2025.11132542","relation":{},"subject":[],"published":{"date-parts":[[2025,6,22]]}}}