{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T18:30:50Z","timestamp":1779906650639,"version":"3.53.1"},"reference-count":44,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:00:00Z","timestamp":1721001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:00:00Z","timestamp":1721001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,7,15]]},"DOI":"10.1109\/jcc62314.2024.00017","type":"proceedings-article","created":{"date-parts":[[2024,9,26]],"date-time":"2024-09-26T13:41:23Z","timestamp":1727358083000},"page":"61-67","source":"Crossref","is-referenced-by-count":6,"title":["FP4-Quantization: Lossless 4bit Quantization for Large Language Models"],"prefix":"10.1109","author":[{"given":"Jie","family":"Wang","sequence":"first","affiliation":[{"name":"National University of Defense Technology,College of Computer,Changsha,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Huanxi","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,College of Computer,Changsha,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dawei","family":"Feng","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,College of Computer,Changsha,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jie","family":"Ding","sequence":"additional","affiliation":[{"name":"Iflytek,R&#x0026;D Group,Hefei,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bo","family":"Ding","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,College of Computer,Changsha,China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gpt-4 technical report[J]","author":"Achiam","year":"2023"},{"key":"ref2","article-title":"LLaMA 2: Open foundation and fine-tuned chat models[J]","author":"Touvron","year":"2023"},{"key":"ref3","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020b","journal-title":"Advances in neural information processing systems"},{"key":"ref4","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Brown","year":"2020a"},{"key":"ref5","article-title":"Llm.int8(): 8-bit matrix multiplication for transformers at scale","author":"Dettmers","year":"2022"},{"key":"ref6","first-page":"38087","article-title":"Smoothquant: Accurate and efficient post-training quantization for large language models[C]","volume-title":"International Conference on Machine Learning","author":"Xiao"},{"key":"ref7","first-page":"27168","article-title":"Zero-quant: Efficient and affordable post-training quantization for large-scale transformers[J]","volume":"35","author":"Yao","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref8","article-title":"Zeroquant-fp: A leap forward in llms post-training w4a8 quantization using floating-point formats[J]","author":"Wu","year":"2023"},{"key":"ref9","first-page":"17402","article-title":"Outlier suppression: Pushing the limit of low-bit transformer language models[J]","volume":"35","author":"Wei","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.102"},{"key":"ref11","article-title":"Owq: Lessons learned from activation outliers for weight quantization in large language models[J]","author":"Lee","year":"2023"},{"key":"ref12","article-title":"Awq: Activation-aware weight quantization for llm compression and acceleration[J]","author":"Lin","year":"2023"},{"key":"ref13","article-title":"Spqr: A sparse-quantized representation for near-lossless llm weight compression[J]","author":"Dettmers","year":"2023"},{"key":"ref14","article-title":"Omniquant: Omnidirectionally calibrated quantization for large language models[J]","author":"Shao","year":"2023"},{"key":"ref15","article-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers[J]","author":"Frantar","year":"2022"},{"key":"ref16","article-title":"Qllm: Accurate and efficient low-bitwidth quantization for large language models[J]","author":"Liu","year":"2023"},{"key":"ref17","first-page":"36","article-title":"Quip: 2-bit quantization of large language models with guarantees[J]","author":"Chee","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref18","article-title":"BiLLM: Pushing the Limit of Post-Training Quantization for LLMs[J]","author":"Huang","year":"2024"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1002\/cpa.21442"},{"key":"ref20","article-title":"SmoothQuant+: Accurate and Efficient 4-bit Post-Training WeightQuantization for LLM[J]","author":"Pan","year":"2023"},{"key":"ref21","article-title":"Lora: Low-rank adaptation of large language models[J]","author":"Hu","year":"2021"},{"key":"ref22","article-title":"Lq-lora: Low-rank plus quantized matrix decomposition for efficient language model finetuning[J]","author":"Guo","year":"2023"},{"key":"ref23","article-title":"Qa-lora: Quantization-aware low-rank adaptation of large language models[J]","author":"Xu","year":"2023"},{"key":"ref24","article-title":"Loftq: Lora-fine-tuning-aware quantization for large language models[J]","author":"Li","year":"2023"},{"key":"ref25","article-title":"LQER: Low-Rank Quantization Error Reconstruction for LLMs[J]","author":"Zhang","year":"2024"},{"key":"ref26","article-title":"Fp8 formats for deep learning[J]","author":"Micikevicius","year":"2022"},{"key":"ref27","article-title":"Opt: Open pre-trained transformer language models[J]","author":"Zhang","year":"2022"},{"key":"ref28","article-title":"Bloom: A 176b-parameter open-access multilingual language model[J]","author":"Le Scao","year":"2022"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p16-1144"},{"key":"ref30","article-title":"BoolQ: Exploring the surprising difficulty of natural yes\/no questions[J]","author":"Clark","year":"2019"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1454"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6239"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6399"},{"key":"ref34","article-title":"Measuring massive multitask language understanding","volume-title":"CoRR","volume":"abs\/2009.03300","author":"Hendrycks","year":"2020"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1260"},{"key":"ref36","author":"Merity","year":"2016","journal-title":"Pointer sentinel mixture models"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21236\/ADA273556"},{"key":"ref39","article-title":"Decoupled weight decay regularization","volume-title":"ICLR","author":"Loshchilov"},{"key":"ref40","article-title":"Scaling language models: Methods, analysis & insights from training gopher","author":"Rae","year":"2021"},{"key":"ref41","article-title":"Using deep-speed and megatron to train megatron-turing nlg 530b, a large-scale generative language model[J]","author":"Smith","year":"2022"},{"key":"ref42","first-page":"5547","article-title":"Glam: Efficient scaling of language models with mixture-of-experts[C]","volume-title":"International Conference on Machine Learning","author":"Du"},{"issue":"240","key":"ref43","first-page":"1","article-title":"Palm: Scaling language modeling with pathways[J]","volume":"24","author":"Chowdhery","year":"2023","journal-title":"Journal of Machine Learning Research"},{"key":"ref44","first-page":"14651","article-title":"Fp8 quantization: The power of the exponent[J]","volume":"35","author":"Kuzmin","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref45","article-title":"FP8-BERT: Post-Training Quantization for Transformer[J]","author":"Li","year":"2023"}],"event":{"name":"2024 IEEE International Conference on Joint Cloud Computing (JCC)","location":"Shanghai, China","start":{"date-parts":[[2024,7,15]]},"end":{"date-parts":[[2024,7,18]]}},"container-title":["2024 IEEE International Conference on Joint Cloud Computing (JCC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10685160\/10685165\/10685437.pdf?arnumber=10685437","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T04:38:10Z","timestamp":1769488690000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10685437\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,15]]},"references-count":44,"URL":"https:\/\/doi.org\/10.1109\/jcc62314.2024.00017","relation":{},"subject":[],"published":{"date-parts":[[2024,7,15]]}}}