{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T06:57:13Z","timestamp":1772866633665,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"the National Natural Science Foundation of China","award":["62222411"],"award-info":[{"award-number":["62222411"]}]},{"name":"the National Natural Science Foundation of China","award":["62025404"],"award-info":[{"award-number":["62025404"]}]},{"name":"the National Key R\\&D Program of China","award":["2023YFB4404400"],"award-info":[{"award-number":["2023YFB4404400"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3676641.3716252","type":"proceedings-article","created":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T16:47:32Z","timestamp":1743094052000},"page":"131-146","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["COMET: Towards Practical W4A4KV4 LLMs Serving"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2226-2303","authenticated-orcid":false,"given":"Lian","family":"Liu","sequence":"first","affiliation":[{"name":"Institute of Computing Technology, CAS, Beijing, China, University of Chinese Academy of Sciences, Beijing, China, and Zhongguancun Laboratory, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1638-059X","authenticated-orcid":false,"given":"Long","family":"Cheng","sequence":"additional","affiliation":[{"name":"North China Electric Power University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2287-7503","authenticated-orcid":false,"given":"Haimeng","family":"Ren","sequence":"additional","affiliation":[{"name":"ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8334-6903","authenticated-orcid":false,"given":"Zhaohui","family":"Xu","sequence":"additional","affiliation":[{"name":"ShanghaiTech University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0012-4113","authenticated-orcid":false,"given":"Yudong","family":"Pan","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, CAS, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7012-2308","authenticated-orcid":false,"given":"Mengdi","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, CAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0874-814X","authenticated-orcid":false,"given":"Xiaowei","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, CAS, Beijing, China and Zhongguancun Laboratory, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0904-6681","authenticated-orcid":false,"given":"Yinhe","family":"Han","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, CAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5172-4736","authenticated-orcid":false,"given":"Ying","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, CAS, Beijing, 
China"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"cublas docs 2024. https:\/\/docs.nvidia.com\/cuda\/cublas\/index.html."},{"key":"e_1_3_2_1_2_1","unstructured":"Qwen2 technical report. 2024."},{"key":"e_1_3_2_1_3_1","first-page":"117","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. Taming {Throughput-Latency} tradeoff in {LLM} inference with {Sarathi-Serve}. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 117--134, 2024."},{"key":"e_1_3_2_1_4_1","volume-title":"Quarot: Outlier-free 4-bit inference in rotated llms. arXiv preprint arXiv:2404.00456","author":"Ashkboos Saleh","year":"2024","unstructured":"Saleh Ashkboos, Amirkeivan Mohtashami, Maximilian L Croci, Bo Li, Martin Jaggi, Dan Alistarh, Torsten Hoefler, and James Hensman. Quarot: Outlier-free 4-bit inference in rotated llms. arXiv preprint arXiv:2404.00456, 2024."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6239"},{"key":"e_1_3_2_1_6_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901, 2020."},{"key":"e_1_3_2_1_7_1","first-page":"36","article-title":"2-bit quantization of large language models with guarantees","author":"Chee Jerry","year":"2024","unstructured":"Jerry Chee, Yaohui Cai, Volodymyr Kuleshov, and ChristopherMDe Sa. Quip: 2-bit quantization of large language models with guarantees. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_8_1","volume-title":"Think you have solved question answering? try arc, the ai2 reasoning challenge. arXiv preprint arXiv:1803.05457","author":"Clark Peter","year":"2018","unstructured":"Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, and Oyvind Tafjord. Think you have solved question answering? try arc, the ai2 reasoning challenge. arXiv preprint arXiv:1803.05457, 2018."},{"key":"e_1_3_2_1_9_1","first-page":"16344","article-title":"Flashattention: Fast and memory-efficient exact attention with io awareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. Flashattention: Fast and memory-efficient exact attention with io awareness. Advances in Neural Information Processing Systems, 35:16344--16359, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_10_1","volume-title":"int8 (): 8-bit matrix multiplication for transformers at scale. arXiv preprint arXiv:2208.07339","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. Llm. int8 (): 8-bit matrix multiplication for transformers at scale. 
arXiv preprint arXiv:2208.07339, 2022."},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Learning Representations","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. An image is worth 16x16 words: Transformers for image recognition at scale. In International Conference on Learning Representations, 2020."},{"key":"e_1_3_2_1_12_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323, 2022."},{"key":"e_1_3_2_1_13_1","volume-title":"A framework for few-shot language model evaluation. Version v0. 0.1. Sept, page 8","author":"Gao Leo","year":"2021","unstructured":"Leo Gao, Jonathan Tow, Stella Biderman, Sid Black, Anthony DiPofi, Charles Foster, Laurence Golding, Jeffrey Hsu, Kyle McDonell, Niklas Muennighoff, et al. A framework for few-shot language model evaluation. Version v0. 0.1. Sept, page 8, 2021."},{"key":"e_1_3_2_1_14_1","volume-title":"ggerganov\/llama.cpp: Port of facebook's llama model in c\/c","author":"Gerganov Georgi","year":"2023","unstructured":"Georgi Gerganov. ggerganov\/llama.cpp: Port of facebook's llama model in c\/c., 2023. https:\/\/github.com\/ggerganov\/llama.cpp."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589038"},{"key":"e_1_3_2_1_16_1","unstructured":"Connor Holmes Masahiro Tanaka Michael Wyatt Ammar Ahmad Awan Jeff Rasley Samyam Rajbhandari Reza Yazdani Aminabadi Heyang Qin Arash Bakhtiari Lev Kurilenko et al. Deepspeed-fastgen: High-throughput text generation for llms via mii and deepspeedinference. arXiv preprint arXiv:2401.08671 2024."},{"key":"e_1_3_2_1_17_1","first-page":"148","article-title":"Flashdecoding: Faster large language model inference with asynchronization, flat gemm optimization, and heuristics","volume":"6","author":"Hong Ke","year":"2024","unstructured":"Ke Hong, Guohao Dai, Jiaming Xu, Qiuli Mao, Xiuhong Li, Jun Liu, Yuhan Dong, Yu Wang, et al. Flashdecoding: Faster large language model inference with asynchronization, flat gemm optimization, and heuristics. Proceedings of Machine Learning and Systems, 6:148--161, 2024.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_18_1","volume-title":"Kurt Keutzer, and Amir Gholami. Kvquant: Towards 10 million context length llm inference with kv cache quantization. arXiv preprint arXiv:2401.18079","author":"Hooper Coleman","year":"2024","unstructured":"Coleman Hooper, Sehoon Kim, Hiva Mohammadzadeh, Michael W Mahoney, Yakun Sophia Shao, Kurt Keutzer, and Amir Gholami. Kvquant: Towards 10 million context length llm inference with kv cache quantization. arXiv preprint arXiv:2401.18079, 2024."},{"key":"e_1_3_2_1_19_1","first-page":"680","article-title":"Automatic load-compute pipelining in deep learning compiler for ai-gpus","volume":"5","author":"Huang Guyue","year":"2023","unstructured":"Guyue Huang, Yang Bai, Liu Liu, Yuke Wang, Bei Yu, Yufei Ding, and Yuan Xie. Alcop: Automatic load-compute pipelining in deep learning compiler for ai-gpus. 
Proceedings of Machine Learning and Systems, 5:680--694, 2023.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4842-8844-3_4"},{"key":"e_1_3_2_1_21_1","volume-title":"Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. Mistral 7b. arXiv preprint arXiv:2310.06825","author":"Jiang Albert Q","year":"2023","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. Mistral 7b. arXiv preprint arXiv:2310.06825, 2023."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3497776.3517770"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_24_1","volume-title":"Owq: Lessons learned from activation outliers for weight quantization in large language models. arXiv preprint arXiv:2306.02272","author":"Lee Changhun","year":"2023","unstructured":"Changhun Lee, Jungyu Jin, Taesu Kim, Hyungjun Kim, and Eunhyeok Park. Owq: Lessons learned from activation outliers for weight quantization in large language models. arXiv preprint arXiv:2306.02272, 2023."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW55747.2022.00124"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3542929.3563510"},{"key":"e_1_3_2_1_27_1","volume-title":"E-sparse: Boosting the large language model inference through entropy-based n: M sparsity. arXiv preprint arXiv:2310.15929","author":"Li Yun","year":"2023","unstructured":"Yun Li, Lin Niu, Xipeng Zhang, Kai Liu, Jianchen Zhu, and Zhanhui Kang. E-sparse: Boosting the large language model inference through entropy-based n: M sparsity. arXiv preprint arXiv:2310.15929, 2023."},{"key":"e_1_3_2_1_28_1","volume-title":"Awq: Activation-aware weight quantization for llm compression and acceleration. arXiv preprint arXiv:2306.00978","author":"Lin Ji","year":"2023","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Xingyu Dang, and Song Han. Awq: Activation-aware weight quantization for llm compression and acceleration. arXiv preprint arXiv:2306.00978, 2023."},{"key":"e_1_3_2_1_29_1","volume-title":"Qserve: W4a8kv4 quantization and system co-design for efficient llm serving. arXiv preprint arXiv:2405.04532","author":"Lin Yujun","year":"2024","unstructured":"Yujun Lin, Haotian Tang, Shang Yang, Zhekai Zhang, Guangxuan Xiao, Chuang Gan, and Song Han. Qserve: W4a8kv4 quantization and system co-design for efficient llm serving. arXiv preprint arXiv:2405.04532, 2024."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.39"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_32_1","volume-title":"Spinquant--llm quantization with learned rotations. arXiv preprint arXiv:2405.16406","author":"Liu Zechun","year":"2024","unstructured":"Zechun Liu, Changsheng Zhao, Igor Fedorov, Bilge Soran, Dhruv Choudhary, Raghuraman Krishnamoorthi, Vikas Chandra, Yuandong Tian, and Tijmen Blankevoort. Spinquant--llm quantization with learned rotations. 
arXiv preprint arXiv:2405.16406, 2024."},{"key":"e_1_3_2_1_33_1","first-page":"22137","volume-title":"International Conference on Machine Learning","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Jue Wang, Tri Dao, Tianyi Zhou, Binhang Yuan, Zhao Song, Anshumali Shrivastava, Ce Zhang, Yuandong Tian, Christopher Re, et al. Deja vu: Contextual sparsity for efficient llms at inference time. In International Conference on Machine Learning, pages 22137--22176. PMLR, 2023."},{"key":"e_1_3_2_1_34_1","volume-title":"Forty-first International Conference on Machine Learning","author":"Liu Zirui","year":"2024","unstructured":"Zirui Liu, Jiayi Yuan, Hongye Jin, Shaochen Zhong, Zhaozhuo Xu, Vladimir Braverman, Beidi Chen, and Xia Hu. Kivi: A tuning-free asymmetric 2bit quantization for kv cache. In Forty-first International Conference on Machine Learning, 2024."},{"key":"e_1_3_2_1_35_1","volume-title":"International Conference on Learning Representations","author":"Merity Stephen","year":"2016","unstructured":"Stephen Merity, Caiming Xiong, James Bradbury, and Richard Socher. Pointer sentinel mixture models. In International Conference on Learning Representations, 2016."},{"key":"e_1_3_2_1_36_1","unstructured":"NVIDIA. Cutlass 3.2 2024. https:\/\/github.com\/NVIDIA\/cutlass."},{"key":"e_1_3_2_1_37_1","volume-title":"Nsight compute profilling guide","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. Nsight compute profilling guide, 2024. https:\/\/docs.nvidia.com\/nsight-compute\/ProfilingGuide\/#introduction."},{"key":"e_1_3_2_1_38_1","volume-title":"Nsight system","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. Nsight system., 2024. https:\/\/developer.nvidia.com\/nsightsystems."},{"key":"e_1_3_2_1_39_1","volume-title":"Parallel thread execution isa version 8.5","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. Parallel thread execution isa version 8.5, 2024. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html."},{"key":"e_1_3_2_1_40_1","unstructured":"NVIDIA. Tensorrt-llm 2024. https:\/\/github.com\/NVIDIA\/TensorRTLLM."},{"key":"e_1_3_2_1_41_1","volume-title":"Gpt-4 technical report","author":"AI.","year":"2023","unstructured":"OpenAI. Gpt-4 technical report, 2023."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577479"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_44_1","volume-title":"Channel permutations for n: m sparsity. Advances in neural information processing systems, 34:13316--13327","author":"Pool Jeff","year":"2021","unstructured":"Jeff Pool and Chong Yu. Channel permutations for n: m sparsity. Advances in neural information processing systems, 34:13316--13327, 2021."},{"key":"e_1_3_2_1_45_1","volume-title":"Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, 21(140):1--67","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou,Wei Li, and Peter J Liu. Exploring the limits of transfer learning with a unified text-to-text transformer. 
Journal of machine learning research, 21(140):1--67, 2020."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474381"},{"key":"e_1_3_2_1_48_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Shao Wenqi","year":"2023","unstructured":"Wenqi Shao, Mengzhao Chen, Zhaoyang Zhang, Peng Xu, Lirui Zhao, Zhiqian Li, Kaipeng Zhang, Peng Gao, Yu Qiao, and Ping Luo. Omniquant: Omnidirectionally calibrated quantization for large language models. In The Twelfth International Conference on Learning Representations, 2023."},{"key":"e_1_3_2_1_49_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Sun Mingjie","year":"2023","unstructured":"Mingjie Sun, Zhuang Liu, Anna Bair, and J Zico Kolter. A simple and effective pruning approach for large language models. In The Twelfth International Conference on Learning Representations, 2023."},{"key":"e_1_3_2_1_50_1","volume-title":"Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie- Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971, 2023."},{"key":"e_1_3_2_1_51_1","volume-title":"Llama 2: Open foundation and finetuned chat models. arXiv preprint arXiv:2307.09288","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, et al. Llama 2: Open foundation and finetuned chat models. arXiv preprint arXiv:2307.09288, 2023."},{"key":"e_1_3_2_1_52_1","volume-title":"Flash-decoding for long-context inference","author":"Daniel Dao","year":"2023","unstructured":"Tri, Dao and Daniel, Haziza and Francisco, Massa and Grigory, Sizov. Flash-decoding for long-context inference, 2023. https:\/\/pytorch.org\/blog\/flash-decoding."},{"key":"e_1_3_2_1_53_1","volume-title":"Even better llm quantization with hadamard incoherence and lattice codebooks. arXiv preprint arXiv:2402.04396","author":"Tseng Albert","year":"2024","unstructured":"Albert Tseng, Jerry Chee, Qingyao Sun, Volodymyr Kuleshov, and Christopher De Sa. Quip#: Even better llm quantization with hadamard incoherence and lattice codebooks. arXiv preprint arXiv:2402.04396, 2024."},{"key":"e_1_3_2_1_54_1","volume-title":"Attention is all you need. Advances in neural information processing systems, 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. Attention is all you need. Advances in neural information processing systems, 30, 2017."},{"key":"e_1_3_2_1_55_1","unstructured":"BigScience Workshop Teven Le Scao Angela Fan Christopher Akiki Ellie Pavlick Suzana Ilic Daniel Hesslow Roman Castagn\u00e9 Alexandra Sasha Luccioni Fran\u00e7ois Yvon et al. Bloom: A 176bparameter open-access multilingual language model. 
arXiv preprint arXiv:2211.05100 2022."},{"key":"e_1_3_2_1_56_1","first-page":"38087","volume-title":"International Conference on Machine Learning","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning, pages 38087--38099. PMLR, 2023."},{"key":"e_1_3_2_1_57_1","volume-title":"vtensor: Flexible virtual tensor management for efficient llm serving. arXiv preprint arXiv:2407.15309","author":"Xu Jiale","year":"2024","unstructured":"Jiale Xu, Rui Zhang, Cong Guo, Weiming Hu, Zihan Liu, Feiyang Wu, Yu Feng, Shixuan Sun, Changxu Shao, Yuhong Guo, et al. vtensor: Flexible virtual tensor management for efficient llm serving. arXiv preprint arXiv:2407.15309, 2024."},{"key":"e_1_3_2_1_58_1","first-page":"27168","article-title":"Efficient and affordable post-training quantization for large-scale transformers","volume":"35","author":"Yao Zhewei","year":"2022","unstructured":"Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, and Yuxiong He. Zeroquant: Efficient and affordable post-training quantization for large-scale transformers. Advances in Neural Information Processing Systems, 35:27168--27183, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_59_1","volume-title":"Rptq: Reorder-based post-training quantization for large language models. arXiv preprint arXiv:2304.01089","author":"Yuan Zhihang","year":"2023","unstructured":"Zhihang Yuan, Lin Niu, Jiawei Liu, Wenyu Liu, Xinggang Wang, Yuzhang Shang, Guangyu Sun, Qiang Wu, Jiaxiang Wu, and Bingzhe Wu. Rptq: Reorder-based post-training quantization for large language models. arXiv preprint arXiv:2304.01089, 2023."},{"key":"e_1_3_2_1_60_1","volume-title":"Wkvquant: Quantizing weight and key\/value cache for large language models gains more. arXiv preprint arXiv:2402.12065","author":"Yue Yuxuan","year":"2024","unstructured":"Yuxuan Yue, Zhihang Yuan, Haojie Duanmu, Sifan Zhou, Jianlong Wu, and Liqiang Nie. Wkvquant: Quantizing weight and key\/value cache for large language models gains more. arXiv preprint arXiv:2402.12065, 2024."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1472"},{"key":"e_1_3_2_1_62_1","volume-title":"Xi Victoria Lin, et al. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068, 2022."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.20944\/preprints202310.1487.v2"},{"key":"e_1_3_2_1_64_1","volume-title":"A survey of large language models. arXiv preprint arXiv:2303.18223","author":"Zhao Wayne Xin","year":"2023","unstructured":"Wayne Xin Zhao, Kun Zhou, Junyi Li, Tianyi Tang, Xiaolei Wang, Yupeng Hou, Yingqian Min, Beichen Zhang, Junjie Zhang, Zican Dong, et al. A survey of large language models. 
arXiv preprint arXiv:2303.18223, 2023."},{"key":"e_1_3_2_1_65_1","first-page":"196","article-title":"Low-bit quantization for efficient and accurate llm serving","volume":"6","author":"Zhao Yilong","year":"2024","unstructured":"Yilong Zhao, Chien-Yu Lin, Kan Zhu, Zihao Ye, Lequn Chen, Size Zheng, Luis Ceze, Arvind Krishnamurthy, Tianqi Chen, and Baris Kasikci. Atom: Low-bit quantization for efficient and accurate llm serving. Proceedings of Machine Learning and Systems, 6:196--209, 2024.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_66_1","first-page":"193","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. {DistServe}: Disaggregating prefill and decoding for goodput-optimized large language model serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 193--210, 2024."},{"key":"e_1_3_2_1_67_1","volume-title":"et al. Nanoflow: Towards optimal large language model serving throughput. arXiv preprint arXiv:2408.12757","author":"Zhu Kan","year":"2024","unstructured":"Kan Zhu, Yilong Zhao, Liangyu Zhao, Gefei Zuo, Yile Gu, Dedong Xie, Yufei Gao, Qinyu Xu, Tian Tang, Zihao Ye, et al. Nanoflow: Towards optimal large language model serving throughput. arXiv preprint arXiv:2408.12757, 2024."}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3716252","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676641.3716252","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:09:47Z","timestamp":1755774587000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3716252"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":67,"alternative-id":["10.1145\/3676641.3716252","10.1145\/3676641"],"URL":"https:\/\/doi.org\/10.1145\/3676641.3716252","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
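A minimal sketch for working with the Crossref work record above programmatically: it assumes the JSON response has been saved to a local file named comet_crossref.json (hypothetical filename) and only reads fields that appear in the record shown (message, author, title, container-title, published, page, DOI); it is an illustration, not part of the record itself.

# Minimal sketch: load the Crossref work record above and print a short citation.
# Assumes the JSON shown has been saved as "comet_crossref.json" (hypothetical filename).
import json

with open("comet_crossref.json", encoding="utf-8") as f:
    work = json.load(f)["message"]  # Crossref wraps the record in a "message" object

# Authors are stored as a list of {"given": ..., "family": ...} objects.
authors = ", ".join(
    f"{a.get('given', '')} {a.get('family', '')}".strip() for a in work.get("author", [])
)
title = work["title"][0]              # title and container-title are stored as lists
venue = work["container-title"][0]
year = work["published"]["date-parts"][0][0]

print(f'{authors}. "{title}". In {venue} ({year}), pp. {work["page"]}. doi:{work["DOI"]}')

Running this against the record above would print the author list, the paper title ("COMET: Towards Practical W4A4KV4 LLMs Serving"), the ASPLOS '25 proceedings title, the page range 131-146, and the DOI 10.1145/3676641.3716252.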