{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T11:32:17Z","timestamp":1763724737807,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,23]],"date-time":"2024-06-23T00:00:00Z","timestamp":1719100800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,23]]},"DOI":"10.1145\/3649329.3656221","type":"proceedings-article","created":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T19:27:22Z","timestamp":1731007642000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Oltron: Algorithm-Hardware Co-design for Outlier-Aware Quantization of LLMs with Inter-\/Intra-Layer Adaptation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9721-247X","authenticated-orcid":false,"given":"Chenhao","family":"Xue","sequence":"first","affiliation":[{"name":"School of Integrated Circuits, Peking University, Beijing, Beijing, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2762-2726","authenticated-orcid":false,"given":"Chen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7958-8485","authenticated-orcid":false,"given":"Xun","family":"Jiang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Peking University, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1013-5781","authenticated-orcid":false,"given":"Zhutianya","family":"Gao","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, ShangHai, ShangHai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0977-2774","authenticated-orcid":false,"given":"Yibo","family":"Lin","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Peking University, Beijing, Beijing, China"},{"name":"Institute of Electronic Design Automation, Peking University, Wuxi, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7315-6589","authenticated-orcid":false,"given":"Guangyu","family":"Sun","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Peking University, Beijing, Beijing, China"},{"name":"Institute of Electronic Design Automation, Peking University, Wuxi, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,11,7]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"1877","article-title":"2020. Language models are few-shot learners","volume":"33","author":"Tom Brown","year":"2020","unstructured":"Tom Brown et al. 2020. Language models are few-shot learners. NIPS 33 (2020), 1877--1901.","journal-title":"NIPS"},{"key":"e_1_3_2_1_2_1","unstructured":"Tim Dettmers et al. 2022. Llm. int8 (): 8-bit matrix multiplication for transformers at scale. 
arXiv preprint arXiv:2208.07339 (2022)."},{"key":"e_1_3_2_1_3_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Elias Frantar","year":"2022","unstructured":"Elias Frantar et al. 2022. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_4_1","volume-title":"Ant: Exploiting adaptive numerical data type for low-bit deep neural network quantization","author":"Cong Guo","year":"2022","unstructured":"Cong Guo et al. 2022. Ant: Exploiting adaptive numerical data type for low-bit deep neural network quantization. In MICRO. IEEE, 1414--1433."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Cong Guo et al. 2023. OliVe: Accelerating Large Language Models via Hardware-friendly Outlier-Victim Pair Quantization. In ISCA. 1--15.","DOI":"10.1145\/3579371.3589038"},{"key":"e_1_3_2_1_6_1","unstructured":"Norman P Jouppi et al. 2017. In-datacenter performance analysis of a tensor processing unit. In ISCA. 1--12."},{"key":"e_1_3_2_1_7_1","unstructured":"Pran Kurup et al. 2012. Logic synthesis using Synopsys\u00ae. Springer Science & Business Media."},{"key":"e_1_3_2_1_8_1","unstructured":"Stephen Merity et al. 2016. Pointer sentinel mixture models. arXiv preprint arXiv:1609.07843 (2016)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Naveen Muralimanohar et al. 2009. CACTI 6.0: A tool to model large caches. HP laboratories 27 (2009) 28.","DOI":"10.1109\/MM.2008.2"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Eunhyeok Park et al. 2018. Energy-efficient neural network accelerator based on outlier-aware low-precision computation. In ISCA. IEEE 688--698.","DOI":"10.1109\/ISCA.2018.00063"},{"key":"e_1_3_2_1_11_1","first-page":"1","article-title":"2020. Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Colin Raffel","year":"2020","unstructured":"Colin Raffel et al. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. J Mach Learn Res 21, 1 (2020), 5485--5551.","journal-title":"J Mach Learn Res"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Satyabrata Sarangi et al. 2021. DeepScaleTool: A tool for the accurate estimation of technology scaling in the deep-submicron era. In ISCAS. IEEE 1--5.","DOI":"10.1109\/ISCAS51556.2021.9401196"},{"key":"e_1_3_2_1_13_1","volume-title":"Omniquant: Omnidirectionally calibrated quantization for large language models. arXiv preprint arXiv:2308.13137","author":"Wenqi Shao","year":"2023","unstructured":"Wenqi Shao et al. 2023. Omniquant: Omnidirectionally calibrated quantization for large language models. arXiv preprint arXiv:2308.13137 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"Dnnweaver: From high-level deep network models to fpga acceleration. In COGARCH.","author":"Hardik Sharma","year":"2016","unstructured":"Hardik Sharma et al. 2016. Dnnweaver: From high-level deep network models to fpga acceleration. In COGARCH."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Hardik Sharma et al. 2018. Bit fusion: Bit-level dynamically composable architecture for accelerating deep neural network. In ISCA. IEEE 764--775.","DOI":"10.1109\/ISCA.2018.00069"},{"key":"e_1_3_2_1_16_1","volume-title":"Llama: Open and efficient foundation language models. 
arXiv preprint arXiv:2302.13971","author":"Hugo Touvron","year":"2023","unstructured":"Hugo Touvron et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_17_1","first-page":"30","article-title":"2017. Attention is all you need","author":"Ashish Vaswani","year":"2017","unstructured":"Ashish Vaswani et al. 2017. Attention is all you need. NIPS 30 (2017).","journal-title":"NIPS"},{"key":"e_1_3_2_1_18_1","unstructured":"Xiuying Wei et al. 2023. Outlier Suppression+: Accurate quantization of large language models by equivalent and optimal shifting and scaling. arXiv preprint arXiv:2304.09145 (2023)."},{"volume-title":"The Free Encyclopedia. [Online]","author":"Wikipedia Wikipedia","key":"e_1_3_2_1_19_1","unstructured":"Wikipedia contributors. 2022. 68-95-99.7 rule --- Wikipedia, The Free Encyclopedia. [Online]."},{"key":"e_1_3_2_1_20_1","volume-title":"Smoothquant: Accurate and efficient post-training quantization for large language models. In ICML. PMLR, 38087--38099.","author":"Guangxuan Xiao","year":"2023","unstructured":"Guangxuan Xiao et al. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In ICML. PMLR, 38087--38099."},{"key":"e_1_3_2_1_21_1","first-page":"27168","article-title":"2022. Zeroquant: Efficient and affordable post-training quantization for large-scale transformers","volume":"35","author":"Zhewei Yao","year":"2022","unstructured":"Zhewei Yao et al. 2022. Zeroquant: Efficient and affordable post-training quantization for large-scale transformers. NIPS 35 (2022), 27168--27183.","journal-title":"NIPS"},{"key":"e_1_3_2_1_22_1","volume-title":"RPTQ: Reorder-based Post-training Quantization for Large Language Models. arXiv preprint arXiv:2304.01089","author":"Zhihang Yuan","year":"2023","unstructured":"Zhihang Yuan et al. 2023. RPTQ: Reorder-based Post-training Quantization for Large Language Models. arXiv preprint arXiv:2304.01089 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Gobo: Quantizing attention-based nlp models for low latency and energy efficient inference","author":"Zadeh Ali Hadi","year":"2020","unstructured":"Ali Hadi Zadeh et al. 2020. Gobo: Quantizing attention-based nlp models for low latency and energy efficient inference. In MICRO. IEEE, 811--824."},{"key":"e_1_3_2_1_24_1","volume-title":"Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068","author":"Susan Zhang","year":"2022","unstructured":"Susan Zhang et al. 2022. Opt: Open pre-trained transformer language models. 
arXiv preprint arXiv:2205.01068 (2022)."}],"event":{"name":"DAC '24: 61st ACM\/IEEE Design Automation Conference","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE-CEDA","SIGBED ACM Special Interest Group on Embedded Systems"],"location":"San Francisco CA USA","acronym":"DAC '24"},"container-title":["Proceedings of the 61st ACM\/IEEE Design Automation Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3656221","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3649329.3656221","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:55Z","timestamp":1750295875000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3656221"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,23]]},"references-count":24,"alternative-id":["10.1145\/3649329.3656221","10.1145\/3649329"],"URL":"https:\/\/doi.org\/10.1145\/3649329.3656221","relation":{},"subject":[],"published":{"date-parts":[[2024,6,23]]},"assertion":[{"value":"2024-11-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
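
For context, the record above is a Crossref REST API "work" object (message-type "work"). A minimal parsing sketch follows, assuming Python 3 with only the standard library and that the JSON above has been saved locally as record.json (the file name is illustrative, not part of the record); it reads only fields that actually appear in the record: title, author, DOI, container-title, issued, and reference.

import json

# Load the Crossref work record shown above (assumed saved as record.json);
# the bibliographic payload sits under the "message" key of the envelope.
with open("record.json") as fh:
    work = json.load(fh)["message"]

# "title" and "container-title" are one-element lists in this record.
title = work["title"][0]
venue = work["container-title"][0]
doi = work["DOI"]
year = work["issued"]["date-parts"][0][0]
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work["author"])

print(f"{authors}. {year}. {title}. In {venue}. https://doi.org/{doi}")

# The deposited bibliography (references-count = 24) is the "reference" array;
# each entry carries a "key" and usually an "unstructured" citation string.
for ref in work["reference"]:
    print(ref["key"], ref.get("unstructured", ""))

The same envelope shape ({"status": ..., "message": ...}) is what the public Crossref endpoint returns for this DOI at https://api.crossref.org/works/10.1145/3649329.3656221, so the sketch applies equally to a freshly fetched response.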