{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:13:49Z","timestamp":1763190829572,"version":"3.45.0"},"reference-count":45,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11228337","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["Understanding the Difficulty of Low-Precision Post-Training Quantization for LLMs"],"prefix":"10.1109","author":[{"given":"Zifei","family":"Xu","sequence":"first","affiliation":[{"name":"d-Matrix,Santa Clara,USA"}]},{"given":"Sayeh","family":"Sharify","sequence":"additional","affiliation":[{"name":"d-Matrix,Santa Clara,USA"}]},{"given":"Wanzin","family":"Yazar","sequence":"additional","affiliation":[{"name":"d-Matrix,Santa Clara,USA"}]},{"given":"Tristan","family":"Webb","sequence":"additional","affiliation":[{"name":"d-Matrix,Santa Clara,USA"}]},{"given":"Xin","family":"Wang","sequence":"additional","affiliation":[{"name":"d-Matrix,Santa Clara,USA"}]}],"member":"263","reference":[{"journal-title":"Estimating or propagating gradients through stochastic neurons for con ditional computation","year":"2013","author":"Bengio","key":"ref1"},{"key":"ref2","article-title":"QuIP: 2-bit quantization of large language models with guarantees","volume":"36","author":"Chee","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"journal-title":"LLM. 
int8 (): 8-bit matrix multiplication for transformers at scale","year":"2022","author":"Dettmers","key":"ref3"},{"journal-title":"SpQR: A sparse-quantized representation for near-lossless llm weight compression","year":"2023","author":"Dettmers","key":"ref4"},{"key":"ref5","article-title":"QLoRA: Efficient finetuning of quantized llms","volume":"36","author":"Dettmers","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"Documenting large webtext corpora: A case study on the colossal clean crawled corpus","year":"2021","author":"Dodge","key":"ref6"},{"journal-title":"The difficulty of training sparse neural networks","year":"2020","author":"Evci","key":"ref7"},{"key":"ref8","first-page":"4475","article-title":"Optimal brain compression: A framework for accurate post-training quantization and pruning","volume":"35","author":"Frantar","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"journal-title":"GPTQ accurate post-training quantization for generative pre-trained transformers","year":"2022","author":"Frantar","key":"ref9"},{"article-title":"OPTQ: Accurate quantization for generative pre-trained transformers","volume-title":"The Eleventh International Conference on Learning Representations","author":"Frantar","key":"ref10"},{"journal-title":"An investigation into neural net optimization via hessian eigenvalue density","year":"2019","author":"Ghorbani","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2205597"},{"journal-title":"LoRA: Low-rank adaptation of large language models","year":"2021","author":"Hu","key":"ref13"},{"journal-title":"BiLLM: Pushing the limit of post-training quantization for llms","year":"2024","author":"Huang","key":"ref14"},{"journal-title":"L4Q: Parameter efficient quantization-aware training on large language models via lora-wise lsq","year":"2024","author":"Jeon","key":"ref15"},{"key":"ref16","article-title":"Memory-efficient fine-tuning of compressed large language models via sub-4-bit integer quantization","volume":"36","author":"Kim","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"journal-title":"Squeezellm: Dense-and-sparse quantization","year":"2023","author":"Kim","key":"ref17"},{"journal-title":"Finequant: Unlocking efficiency with fine-grained weight-only quantization for llms","year":"2023","author":"Kim","key":"ref18"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i12.29237"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"journal-title":"Visualizing the loss landscape of neural nets","year":"2018","author":"Li","key":"ref21"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.353"},{"journal-title":"LoftQ: Lora-fine-tuning-aware quantization for large language models","year":"2023","author":"Li","key":"ref23"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3714983.3714987"},{"key":"ref25","first-page":"1950","article-title":"Few-shot parameter-efficient fine-tuning is better and cheaper than in-context learning","volume":"35","author":"Liu","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2023.08.012"},{"article-title":"Decoupled weight decay regularization","year":"2017","author":"Loshchilov","key":"ref27"},{"journal-title":"The era of 1-bit llms: All large language models are in 1.58 
bits","year":"2024","author":"Ma","key":"ref28"},{"journal-title":"Pointer sentinel mixture models","year":"2016","author":"Merity","key":"ref29"},{"volume-title":"The lambada dataset: Word prediction requiring a broad discourse context","year":"2016","author":"Paperno","key":"ref30"},{"journal-title":"Lut-gemm: Quantized matrix multiplication based on luts for efficient inference in large-scale generative language models","year":"2022","author":"Park","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.410"},{"issue":"8","key":"ref33","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"journal-title":"Eigenvalues of the hessian in deep learning: Singularity and beyond","year":"2017","author":"Sagun","key":"ref34"},{"journal-title":"Empirical analysis of the hessian of over-parametrized neural networks","year":"2018","author":"Sagun","key":"ref35"},{"journal-title":"PB-LLM: Partially binarized large language models","year":"2023","author":"Shang","key":"ref36"},{"article-title":"Llama 2: Open foundation and fine-tuned chat models","year":"2023","author":"Touvron","key":"ref37"},{"journal-title":"BitNet: Scaling 1-bit transformers for large language models","year":"2023","author":"Wang","key":"ref38"},{"article-title":"Weaver: Foundation models for creative writing","year":"2024","author":"Wang","key":"ref39"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/JAS.2023.123618"},{"key":"ref41","first-page":"38087","article-title":"SmoothQuant: Accurate and efficient post-training quantization for large language models","volume-title":"International Conference on Machine Learning","author":"Xiao"},{"journal-title":"A paradigm shift in machine translation: Boosting translation performance of large language models","year":"2023","author":"Xu","key":"ref42"},{"key":"ref43","first-page":"27168","article-title":"Zeroquant: Efficient and affordable post-training quantization for large-scale transformers","volume":"35","author":"Yao","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"OPT: Open pre-trained transformer language models","year":"2022","author":"Zhang","key":"ref44"},{"journal-title":"Building emotional support chatbots in the era of llms","year":"2023","author":"Zheng","key":"ref45"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11228337.pdf?arnumber=11228337","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:12:05Z","timestamp":1763190725000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11228337\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":45,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11228337","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}