{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T01:48:40Z","timestamp":1773193720082,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":19,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,23]],"date-time":"2024-06-23T00:00:00Z","timestamp":1719100800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Shenzhen Science and Technology Program","award":["KQTD20200820113051096"],"award-info":[{"award-number":["KQTD20200820113051096"]}]},{"name":"Science and Technology Innovation Committee Foundation of Shenzhen","award":["JCYJ20220818100217038"],"award-info":[{"award-number":["JCYJ20220818100217038"]}]},{"name":"Theme-based Research Scheme (TRS) project","award":["T45-701\/22-R"],"award-info":[{"award-number":["T45-701\/22-R"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,23]]},"DOI":"10.1145\/3649329.3658498","type":"proceedings-article","created":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T19:27:22Z","timestamp":1731007642000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":18,"title":["APTQ: Attention-aware Post-Training Mixed-Precision Quantization for Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9866-6588","authenticated-orcid":false,"given":"Ziyi","family":"Guan","sequence":"first","affiliation":[{"name":"Department of Electrical and Electronic Engineering, The University of Hong Kong, Hong Kong, Hong Kong"},{"name":"School of Microelectronics, Southern University of Science and Technology, ShenZhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2533-2082","authenticated-orcid":false,"given":"Hantao","family":"Huang","sequence":"additional","affiliation":[{"name":"School of Microelectronics, Southern University of Science and Technology, ShenZhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0575-4038","authenticated-orcid":false,"given":"Yupeng","family":"Su","sequence":"additional","affiliation":[{"name":"School of Microelectronics, Southern University of Science and Technology, ShenZhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2448-169X","authenticated-orcid":false,"given":"Hong","family":"Huang","sequence":"additional","affiliation":[{"name":"School of Microelectronics, Southern University of Science and Technology, ShenZhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3026-0108","authenticated-orcid":false,"given":"Ngai","family":"Wong","sequence":"additional","affiliation":[{"name":"Department of Electrical and Electronic Engineering, The University of Hong Kong, Hong Kong, Hong Kong Special Administrative Region of China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2674-4118","authenticated-orcid":false,"given":"Hao","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Microelectronics, Southern University of Science and Technology, ShenZhen, Guangdong, China"}]}],"member":"320","published-online":{"date-parts":[[2024,11,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Miguel A Carreira-Perpin\u00e1n and Yerlan Idelbayev. 2018. \"learning-compression\" algorithms for neural net pruning. In IEEE CVPR. 8532--8541.","DOI":"10.1109\/CVPR.2018.00890"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01396-x"},{"key":"e_1_3_2_1_3_1","volume-title":"SpQR: A Sparse-Quantized Representation for Near-Lossless LLM Weight Compression. arXiv preprint arXiv:2306.03078","author":"Dettmers Tim","year":"2023","unstructured":"Tim Dettmers, Ruslan Svirschevski, Vage Egiazarian, Denis Kuznedelev, Elias Frantar, Saleh Ashkboos, Alexander Borzunov, Torsten Hoefler, and Dan Alistarh. 2023. SpQR: A Sparse-Quantized Representation for Near-Lossless LLM Weight Compression. arXiv preprint arXiv:2306.03078 (2023)."},{"key":"e_1_3_2_1_4_1","first-page":"18518","article-title":"Hawq-v2: Hessian aware trace-weighted quantization of neural networks","volume":"33","author":"Dong Zhen","year":"2020","unstructured":"Zhen Dong, Zhewei Yao, Daiyaan Arfeen, Amir Gholami, Michael W Mahoney, and Kurt Keutzer. 2020. Hawq-v2: Hessian aware trace-weighted quantization of neural networks. NIPS 33 (2020), 18518--18529.","journal-title":"NIPS"},{"key":"e_1_3_2_1_5_1","first-page":"4475","article-title":"Optimal brain compression: A framework for accurate post-training quantization and pruning","volume":"35","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar and Dan Alistarh. 2022. Optimal brain compression: A framework for accurate post-training quantization and pruning. NeurIPS 35 (2022), 4475--4488.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_6_1","volume-title":"GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers. ICLR","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2023. GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers. ICLR (2023)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","unstructured":"Leo Gao Jonathan Tow Stella Biderman Charles Lovering Jason Phang Anish Thite Fazz Niklas Muennighoff and et al. 2022. EleutherAI\/lm-evaluation-harness: v0.3.0. 10.5281\/zenodo.7413426","DOI":"10.5281\/zenodo.7413426"},{"key":"e_1_3_2_1_8_1","volume-title":"SqueezeLLM: Dense-and-Sparse Quantization. arXiv preprint arXiv:2306.07629","author":"Kim Sehoon","year":"2023","unstructured":"Sehoon Kim, Coleman Hooper, Amir Gholami, Zhen Dong, Xiuyu Li, Sheng Shen, Michael W Mahoney, and Kurt Keutzer. 2023. SqueezeLLM: Dense-and-Sparse Quantization. arXiv preprint arXiv:2306.07629 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Optimal brain damage. NeurIPS 2","author":"LeCun Yann","year":"1989","unstructured":"Yann LeCun, John Denker, and Sara Solla. 1989. Optimal brain damage. NeurIPS 2 (1989)."},{"key":"e_1_3_2_1_10_1","volume-title":"OWQ: Lessons learned from activation outliers for weight quantization in large language models. arXiv preprint arXiv:2306.02272","author":"Lee Changhun","year":"2023","unstructured":"Changhun Lee, Jungyu Jin, Taesu Kim, Hyungjun Kim, and Eunhyeok Park. 2023. OWQ: Lessons learned from activation outliers for weight quantization in large language models. arXiv preprint arXiv:2306.02272 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"LLM-FP4: 4-Bit Floating-Point Quantized Transformers. arXiv preprint arXiv:2310.16836","author":"Liu Zechun","year":"2023","unstructured":"Shih-yang Liu, Zechun Liu, Xijie Huang, Pingcheng Dong, and Kwang-Ting Cheng. 2023. LLM-FP4: 4-Bit Floating-Point Quantized Transformers. arXiv preprint arXiv:2310.16836 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"LLM-QAT: Data-Free Quantization Aware Training for Large Language Models. arXiv preprint arXiv:2305.17888","author":"Liu Zechun","year":"2023","unstructured":"Zechun Liu, Barlas Oguz, Changsheng Zhao, Ernie Chang, Pierre Stock, Yashar Mehdad, Yangyang Shi, Raghuraman Krishnamoorthi, and Vikas Chandra. 2023. LLM-QAT: Data-Free Quantization Aware Training for Large Language Models. arXiv preprint arXiv:2305.17888 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Pointer sentinel mixture models. arXiv preprint arXiv:1609.07843","author":"Merity Stephen","year":"2016","unstructured":"Stephen Merity, Caiming Xiong, James Bradbury, and Richard Socher. 2016. Pointer sentinel mixture models. arXiv preprint arXiv:1609.07843 (2016)."},{"key":"e_1_3_2_1_14_1","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. 2022. Training language models to follow instructions with human feedback. NeurIPS 35 (2022), 27730--27744.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_1_16_1","volume-title":"PB-LLM: Partially Binarized Large Language Models. arXiv preprint arXiv:2310.00034","author":"Shang Yuzhang","year":"2023","unstructured":"Yuzhang Shang, Zhihang Yuan, Qiang Wu, and Zhen Dong. 2023. PB-LLM: Partially Binarized Large Language Models. arXiv preprint arXiv:2310.00034 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"Smoothquant: Accurate and efficient post-training quantization for large language models. In ICML. 38087--38099.","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In ICML. 38087--38099."},{"key":"e_1_3_2_1_19_1","volume-title":"Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, et al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."}],"event":{"name":"DAC '24: 61st ACM\/IEEE Design Automation Conference","location":"San Francisco CA USA","acronym":"DAC '24","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE-CEDA","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 61st ACM\/IEEE Design Automation Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3658498","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3649329.3658498","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:01Z","timestamp":1750295881000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3658498"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,23]]},"references-count":19,"alternative-id":["10.1145\/3649329.3658498","10.1145\/3649329"],"URL":"https:\/\/doi.org\/10.1145\/3649329.3658498","relation":{},"subject":[],"published":{"date-parts":[[2024,6,23]]},"assertion":[{"value":"2024-11-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}