{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T08:10:29Z","timestamp":1768983029340,"version":"3.49.0"},"reference-count":26,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:00:00Z","timestamp":1721001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:00:00Z","timestamp":1721001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,7,15]]},"DOI":"10.1109\/icme57554.2024.10688089","type":"proceedings-article","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T17:24:16Z","timestamp":1727717056000},"page":"1-6","source":"Crossref","is-referenced-by-count":5,"title":["Integer or Floating Point? New Outlooks for Low-Bit Quantization on Large Language Models"],"prefix":"10.1109","author":[{"given":"Yijia","family":"Zhang","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"given":"Lingran","family":"Zhao","sequence":"additional","affiliation":[{"name":"Peking University"}]},{"given":"Shijie","family":"Cao","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia"}]},{"given":"Sicheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"given":"Wenqiang","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"given":"Ting","family":"Cao","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia"}]},{"given":"Fan","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia"}]},{"given":"Mao","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia"}]},{"given":"Shanghang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Peking University"}]},{"given":"Ningyi","family":"Xu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Vima: General robot manipulation with multimodal prompts","author":"Jiang","year":"2022"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME55011.2023.00014"},{"key":"ref3","article-title":"Model compression via distillation and quantization","author":"Polino","year":"2018"},{"key":"ref4","article-title":"Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding","author":"Han","year":"2015"},{"key":"ref5","article-title":"Integer quantization for deep learning inference: Principles and empirical evaluation","author":"Wu","year":"2020"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00286"},{"key":"ref7","article-title":"Llm. int8 (): 8-bit matrix multiplication for transformers at scale","author":"Dettmers","year":"2022"},{"key":"ref8","article-title":"Smoothquant: Accurate and efficient post-training quantization for large language models","author":"Xiao","year":"2022"},{"key":"ref9","article-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers","author":"Frantar","year":"2022"},{"key":"ref10","article-title":"Fp8 quantization: The power of the exponent","author":"Kuzmin","year":"2022"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42613.2021.9365791"},{"key":"ref12","article-title":"NVIDIA H100 Tensor Core GPU Architecture"},{"key":"ref13","first-page":"7750","article-title":"The case for 4-bit precision: k-bit inference scaling laws","volume-title":"International Conference on Machine Learning","author":"Dettmers"},{"key":"ref14","article-title":"Awq: Activation-aware weight quantization for llm compression and acceleration","author":"Lin","year":"2023"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00035"},{"key":"ref17","article-title":"A survey of quantization methods for efficient neural network inference","author":"Gholami","year":"2021"},{"key":"ref18","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref19","article-title":"Opt: Open pre-trained transformer language models","author":"Zhang","year":"2022"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1144"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6239"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1472"},{"key":"ref23","article-title":"The wikitext long term dependency language modeling dataset","volume":"9","author":"Merity","year":"2016","journal-title":"Salesforce Metamind"},{"key":"ref24","article-title":"PPQ Library"},{"key":"ref25","article-title":"GPTQ Library"},{"key":"ref26","article-title":"FasterTransformer Library"}],"event":{"name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","location":"Niagara Falls, ON, Canada","start":{"date-parts":[[2024,7,15]]},"end":{"date-parts":[[2024,7,19]]}},"container-title":["2024 IEEE International Conference on Multimedia and Expo (ICME)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10685847\/10687354\/10688089.pdf?arnumber=10688089","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T06:23:11Z","timestamp":1727763791000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10688089\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,15]]},"references-count":26,"URL":"https:\/\/doi.org\/10.1109\/icme57554.2024.10688089","relation":{},"subject":[],"published":{"date-parts":[[2024,7,15]]}}}