{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,22]],"date-time":"2026-03-22T22:43:39Z","timestamp":1774219419132,"version":"3.50.1"},"reference-count":33,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,22]],"date-time":"2025-06-22T00:00:00Z","timestamp":1750550400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,22]],"date-time":"2025-06-22T00:00:00Z","timestamp":1750550400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100007219","name":"Natural Science Foundation of Shanghai","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100007219","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,22]]},"DOI":"10.1109\/dac63849.2025.11132862","type":"proceedings-article","created":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T17:35:41Z","timestamp":1757957741000},"page":"1-7","source":"Crossref","is-referenced-by-count":1,"title":["MILLION: MasterIng Long-Context LLM Inference Via Outlier-Immunized KV Product QuaNtization"],"prefix":"10.1109","author":[{"given":"Zongwu","family":"Wang","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Peng","family":"Xu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fangxin","family":"Liu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yiwei","family":"Hu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qingxiao","family":"Sun","sequence":"additional","affiliation":[{"name":"China University of Petroleum-Beijing"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gezi","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cheng","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuan","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Li","family":"Jiang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haibing","family":"Guan","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Longlora: Efficient fine-tuning of long-context large language models","author":"Chen","year":"2023","journal-title":"arXiv preprint arXiv:2309.12307"},{"key":"ref2","article-title":"Introducing claude 3.5 sonnet anthropic","year":"2024"},{"key":"ref3","article-title":"Models - openai api","year":"2024"},{"key":"ref4","first-page":"521","article-title":"Orca: A distributed serving system for {Transformer-Based} generative models","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu"},{"key":"ref5","article-title":"Taming throughput-latency tradeoff in 1 lm inference with sarathi-serve","author":"Agrawal","year":"2024","journal-title":"arXiv preprint arXiv:2403.02310"},{"key":"ref6","article-title":"Fast transformer decoding: One write-head is all you need","author":"Shazeer","year":"2019","journal-title":"arXiv preprint arXiv:1911.02150"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"ref8","article-title":"Deepseek-v2: A strong, economical, and efficient mixture-of-experts language model","author":"Liu","year":"2024","journal-title":"arXiv preprint arXiv:2405.04434"},{"key":"ref9","article-title":"You only cache once: Decoder-decoder architectures for language models","author":"Sun","year":"2024","journal-title":"arXiv preprint arXiv:2405.05254"},{"key":"ref10","article-title":"Chai: Clustered head attention for efficient 1 lm inference","author":"Agarwal","year":"2024","journal-title":"arXiv preprint arXiv:2403.08058"},{"key":"ref11","article-title":"Longformer: The long-document transformer","author":"Beltagy","year":"2020","journal-title":"arXiv preprint arXiv:2004.05150"},{"key":"ref12","article-title":"Efficient streaming language models with attention sinks","author":"Xiao","year":"2023","journal-title":"arXiv preprint arXiv:2309.17453"},{"key":"ref13","article-title":"H2o: Heavy-hitter oracle for efficient generative inference of large language models","volume":"36","author":"Zhang","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.195"},{"key":"ref15","article-title":"Corm: Cache optimization with recent message for large language model inference","author":"Dai"},{"key":"ref16","article-title":"Snapkv: Llm knows what you are looking for before generation","author":"Li","year":"2024","journal-title":"arXiv preprint arXiv:2404.14469"},{"key":"ref17","first-page":"114","article-title":"Keyformer: Kv cache reduction through key tokens selection for efficient generative inference","volume-title":"Proceedings of Machine Learning and Systems","volume":"6","author":"Adnan"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00060"},{"key":"ref19","article-title":"No token left behind: Reliable kv cache compression via importance-aware mixed precision quantization","author":"Yang","year":"2024","journal-title":"arXiv preprint arXiv:2402.18096"},{"key":"ref20","article-title":"Quest: Queryaware sparsity for efficient long-context llm inference","author":"Tang","year":"2024","journal-title":"arXiv preprint arXiv:2406.10774"},{"key":"ref21","article-title":"Extreme compression of large language models via additive quantization","author":"Egiazarian","year":"2024","journal-title":"arXiv preprint arXiv:2401.06118"},{"key":"ref22","article-title":"Quarot: Outlier-free 4-bit inference in rotated llms","author":"Ashkboos","year":"2024","journal-title":"arXiv preprint arXiv:2404.00456"},{"key":"ref23","article-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers","author":"Frantar","year":"2022","journal-title":"arXiv preprint arXiv:2210.17323"},{"key":"ref24","article-title":"Awq: Activation-aware weight quantization for 1 lm compression and acceleration","author":"Lin"},{"key":"ref25","article-title":"Qserve: W4a8kv4 quantization and system co-design for efficient 1 lm serving","author":"Lin","year":"2024","journal-title":"arXiv preprint arXiv:2405.04532"},{"key":"ref26","article-title":"Mamba: Linear-time sequence modeling with selective state spaces","author":"Gu","year":"2023","journal-title":"arXiv preprint arXiv:2312.00752"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.936"},{"key":"ref28","article-title":"Leave no context behind: Efficient infinite context transformers with infini-attention","author":"Munkhdalai","year":"2024","journal-title":"arXiv preprint arXiv:2404.07143"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2010.57"},{"key":"ref30","first-page":"177","article-title":"Results of the neurips\u201921 challenge on billion-scale approximate nearest neighbor search","author":"Simhadri","year":"2022","journal-title":"NeurIPS 2021 Competitions and Demonstrations Track."},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.172"},{"key":"ref32","article-title":"Kivi: A tuning-free asymmetric 2bit quantization for kv cache","author":"Liu","year":"2024","journal-title":"arXiv preprint arXiv:2402.02750"},{"key":"ref33","article-title":"Kvquant: Towards 10 million context length llm inference with kv cache quantization","author":"Hooper"}],"event":{"name":"2025 62nd ACM\/IEEE Design Automation Conference (DAC)","location":"San Francisco, CA, USA","start":{"date-parts":[[2025,6,22]]},"end":{"date-parts":[[2025,6,25]]}},"container-title":["2025 62nd ACM\/IEEE Design Automation Conference (DAC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11132383\/11132091\/11132862.pdf?arnumber=11132862","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T05:24:31Z","timestamp":1758000271000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11132862\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,22]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/dac63849.2025.11132862","relation":{},"subject":[],"published":{"date-parts":[[2025,6,22]]}}}