{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T16:38:27Z","timestamp":1778258307272,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T00:00:00Z","timestamp":1729987200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,27]]},"DOI":"10.1145\/3676536.3676796","type":"proceedings-article","created":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T12:53:56Z","timestamp":1744203236000},"page":"1-9","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Fast and Efficient 2-bit LLM Inference on GPU: 2\/4\/16-bit in a Weight Matrix with Asynchronous Dequantization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-4286-6359","authenticated-orcid":false,"given":"Jinhao","family":"Li","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7000-6537","authenticated-orcid":false,"given":"Jiaming","family":"Xu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University &amp; Infinigence-AI, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4755-6881","authenticated-orcid":false,"given":"Shiyao","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua University &amp; Infinigence-AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2012-8540","authenticated-orcid":false,"given":"Shan","family":"Huang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8280-9072","authenticated-orcid":false,"given":"Jun","family":"Liu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7858-5132","authenticated-orcid":false,"given":"Yaoxiu","family":"Lian","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0849-3252","authenticated-orcid":false,"given":"Guohao","family":"Dai","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University &amp; Infinigence-AI, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,4,9]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Piqa: Reasoning about physical commonsense in natural language. In AAAI.","author":"Bisk Yonatan","year":"2020","unstructured":"Yonatan Bisk, Rowan Zellers, et al. 2020. Piqa: Reasoning about physical commonsense in natural language. In AAAI."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01004"},{"key":"e_1_3_2_1_3_1","unstructured":"Peter Clark Isaac Cowhey et al. 2018. Think you have solved question answering? try arc the ai2 reasoning challenge. arXiv preprint arXiv:1803.05457 (2018)."},{"key":"e_1_3_2_1_4_1","volume-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness. NeurIPS","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, et al. 2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. 
NeurIPS (2022)."},{"key":"e_1_3_2_1_5_1","unstructured":"Tim Dettmers et al. 2023. SpQR: A Sparse-Quantized Representation for Near-Lossless LLM Weight Compression. arXiv preprint arXiv:2306.03078 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3596490"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAES.1976.308294"},{"key":"e_1_3_2_1_9_1","unstructured":"Elias Frantar and Dan Alistarh. 2023. SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot. (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_11_1","volume-title":"Average Price: Electricity per Kilowatt-Hour in U.S. City Average. https:\/\/fred.stlouisfed.org\/series\/APU000072610.","author":"FRED.","year":"2024","unstructured":"FRED. 2024. Average Price: Electricity per Kilowatt-Hour in U.S. City Average. https:\/\/fred.stlouisfed.org\/series\/APU000072610."},{"key":"e_1_3_2_1_12_1","volume-title":"APTQ: Attention-aware Post-Training Mixed-Precision Quantization for Large Language Models. arXiv preprint arXiv:2402.14866","author":"Guan Ziyi","year":"2024","unstructured":"Ziyi Guan, Hantao Huang, Yupeng Su, Hong Huang, Ngai Wong, and Hao Yu. 2024. APTQ: Attention-aware Post-Training Mixed-Precision Quantization for Large Language Models. arXiv preprint arXiv:2402.14866 (2024)."},{"key":"e_1_3_2_1_13_1","unstructured":"Nianhui Guo et al. 2023. Advanced Ultra-Low Bitrate Compression Techniques for the LLaMA Family of LLMs. https:\/\/github.com\/GreenBitAI\/low_bit_llama (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"BiLLM: Pushing the Limit of Post-Training Quantization for LLMs. arXiv preprint arXiv:2402.04291","author":"Huang Wei","year":"2024","unstructured":"Wei Huang, Yangdong Liu, Haotong Qin, Ying Li, Shiming Zhang, Xianglong Liu, Michele Magno, and Xiaojuan Qi. 2024. BiLLM: Pushing the Limit of Post-Training Quantization for LLMs. arXiv preprint arXiv:2402.04291 (2024)."},{"key":"e_1_3_2_1_15_1","unstructured":"HuggingFace. 2024. https:\/\/huggingface.co\/."},{"key":"e_1_3_2_1_16_1","first-page":"160","article-title":"Abstractive Long Text Summarization Using Large Language Models","volume":"12","author":"Keswani Gunjan","year":"2024","unstructured":"Gunjan Keswani, Wani Bisen, Hirkani Padwad, Yash Wankhedkar, Sudhanshu Pandey, and Ayushi Soni. 2024. Abstractive Long Text Summarization Using Large Language Models. International Journal of Intelligent Systems and Applications in Engineering 12, 12s (2024), 160--168.","journal-title":"International Journal of Intelligent Systems and Applications in Engineering"},{"key":"e_1_3_2_1_17_1","unstructured":"Sehoon Kim Coleman Hooper et al. 2023. SqueezeLLM: Dense-and-Sparse Quantization. 
arXiv preprint arXiv:2306.07629 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"The Efficient Natural Language and Speech Processing Workshop with NeurIPS","volume":"9","author":"Li Shiyao","year":"2023","unstructured":"Shiyao Li, Xuefei Ning, Ke Hong, Tengxuan Liu, Luning Wang, Xiuhong Li, Kai Zhong, Guohao Dai, Huazhong Yang, and Yu Wang. 2023. Llm-mq: Mixed-precision quantization for efficient llm deployment. In The Efficient Natural Language and Speech Processing Workshop with NeurIPS, Vol. 9."},{"key":"e_1_3_2_1_19_1","first-page":"87","article-title":"AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration","volume":"6","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. 2024. AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration. Proceedings of Machine Learning and Systems 6 (2024), 87--100.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_20_1","volume-title":"Yuyao Wang, and Lingming Zhang.","author":"Liu Jiawei","year":"2024","unstructured":"Jiawei Liu, Chunqiu Steven Xia, Yuyao Wang, and Lingming Zhang. 2024. Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_21_1","unstructured":"Zechun Liu Barlas Oguz et al. 2023. LLM-QAT: Data-Free Quantization Aware Training for Large Language Models. arXiv preprint arXiv:2305.17888 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1002\/asi.24750"},{"key":"e_1_3_2_1_23_1","unstructured":"Stephen Merity Caiming Xiong et al. 2016. Pointer sentinel mixture models. arXiv preprint arXiv:1609.07843 (2016)."},{"key":"e_1_3_2_1_24_1","unstructured":"Meta. 2024. Build the future of AI with Meta Llama 3. https:\/\/llama.meta.com\/llama3\/."},{"key":"e_1_3_2_1_25_1","volume-title":"Introducing LLaMA: A foundational, 65-billion-parameter large language model. Meta AI","author":"Meta AI","year":"2023","unstructured":"AI Meta. 2023. Introducing LLaMA: A foundational, 65-billion-parameter large language model. Meta AI (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605943"},{"key":"e_1_3_2_1_27_1","unstructured":"Markus Nagel Marios Fournarakis et al. 2021. A white paper on neural network quantization. arXiv preprint arXiv:2106.08295 (2021)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-022-04052-8"},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Machine Learning.","author":"Ni Ansong","year":"2023","unstructured":"Ansong Ni, Srini Iyer, et al. 2023. Lever: Learning to verify language-to-code generation with execution. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_30_1","unstructured":"NVIDIA. 2024. NVIDIA CUDA Sparse Matrix Library. https:\/\/docs.nvidia.com\/cuda\/cusparse\/."},{"key":"e_1_3_2_1_31_1","unstructured":"NVIDIA. 2024. NVIDIA Management Library (NVML) | NVIDIA Developer. https:\/\/developer.nvidia.com\/nvidia-management-library-nvml."},{"key":"e_1_3_2_1_32_1","unstructured":"Dylan Patel and Afzal Ahmad. 2024. The Inference Cost Of Search Disruption - Large Language Model Cost Analysis. https:\/\/www.semianalysis.com\/p\/the-inference-cost-of-search-disruption."},{"key":"e_1_3_2_1_33_1","unstructured":"Baolin Peng Chunyuan Li et al. 2023. 
Instruction tuning with gpt-4. arXiv preprint arXiv:2304.03277 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"Ronan Le Bras, et al","author":"Sakaguchi Keisuke","year":"2021","unstructured":"Keisuke Sakaguchi, Ronan Le Bras, et al. 2021. Winogrande: An adversarial winograd schema challenge at scale. Commun. ACM (2021)."},{"key":"e_1_3_2_1_35_1","volume-title":"Are emergent abilities of large language models a mirage? Advances in Neural Information Processing Systems 36","author":"Schaeffer Rylan","year":"2024","unstructured":"Rylan Schaeffer, Brando Miranda, and Sanmi Koyejo. 2024. Are emergent abilities of large language models a mirage? Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Omniquant: Omnidirectionally calibrated quantization for large language models. arXiv preprint arXiv:2308.13137","author":"Shao Wenqi","year":"2023","unstructured":"Wenqi Shao, Mengzhao Chen, Zhaoyang Zhang, Peng Xu, Lirui Zhao, Zhiqian Li, Kaipeng Zhang, Peng Gao, Yu Qiao, and Ping Luo. 2023. Omniquant: Omnidirectionally calibrated quantization for large language models. arXiv preprint arXiv:2308.13137 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6409"},{"key":"e_1_3_2_1_38_1","unstructured":"Hugo Touvron Louis Martin et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_40_1","volume-title":"GLUE: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461","author":"Wang Alex","year":"2018","unstructured":"Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R Bowman. 2018. GLUE: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461 (2018)."},{"key":"e_1_3_2_1_41_1","unstructured":"Jason Wei Yi Tay Rishi Bommasani Colin Raffel Barret Zoph Sebastian Borgeaud Dani Yogatama Maarten Bosma Denny Zhou Donald Metzler et al. 2022. Emergent abilities of large language models. arXiv preprint arXiv:2206.07682 (2022)."},{"key":"e_1_3_2_1_42_1","volume-title":"Smoothquant: Accurate and efficient posttraining quantization for large language models. In ICML.","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, et al. 2023. Smoothquant: Accurate and efficient posttraining quantization for large language models. In ICML."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00071"},{"key":"e_1_3_2_1_44_1","volume-title":"Hellaswag: Can a machine really finish your sentence? arXiv preprint arXiv:1905.07830","author":"Zellers Rowan","year":"2019","unstructured":"Rowan Zellers, Ari Holtzman, et al. 2019. Hellaswag: Can a machine really finish your sentence? arXiv preprint arXiv:1905.07830 (2019)."},{"key":"e_1_3_2_1_45_1","unstructured":"Aohan Zeng Xiao Liu et al. 2022. Glm-130b: An open bilingual pre-trained model. 
arXiv preprint arXiv:2210.02414 (2022)."}],"event":{"name":"ICCAD '24: 43rd IEEE\/ACM International Conference on Computer-Aided Design","location":"Newark Liberty International Airport Marriott New York NY USA","acronym":"ICCAD '24","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE CAS","IEEE CEDA","IEEE EDS"]},"container-title":["Proceedings of the 43rd IEEE\/ACM International Conference on Computer-Aided Design"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676796","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676536.3676796","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:44Z","timestamp":1750295924000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676796"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,27]]},"references-count":45,"alternative-id":["10.1145\/3676536.3676796","10.1145\/3676536"],"URL":"https:\/\/doi.org\/10.1145\/3676536.3676796","relation":{},"subject":[],"published":{"date-parts":[[2024,10,27]]},"assertion":[{"value":"2025-04-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
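
The record above is condensed from the work's Crossref metadata. As a minimal sketch, a record like this can be fetched from Crossref's public REST API at https://api.crossref.org/works/{DOI} using the `requests` package; the client name and mailto address below are placeholders, not anything from the original record:

```python
import requests

DOI = "10.1145/3676536.3676796"

# Crossref's works endpoint returns a JSON envelope of the form
# {"status": "ok", "message-type": "work", ..., "message": {...}}.
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    # A mailto in the User-Agent opts into Crossref's "polite" pool;
    # this contact address is a placeholder.
    headers={"User-Agent": "metadata-fetch/0.1 (mailto:you@example.org)"},
    timeout=30,
)
resp.raise_for_status()
work = resp.json()["message"]

# "title" and "container-title" are lists in the Crossref schema.
print(work["title"][0])
print(", ".join(f'{a["given"]} {a["family"]}' for a in work["author"]))
print(work["container-title"][0], "-", work["DOI"])
```

Resolving https://doi.org/10.1145/3676536.3676796 with an Accept: application/json header should yield the same record via DOI content negotiation.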