{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T19:05:18Z","timestamp":1776366318855,"version":"3.51.2"},"reference-count":51,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T00:00:00Z","timestamp":1759968000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T00:00:00Z","timestamp":1759968000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,9]]},"DOI":"10.1109\/dsaa65442.2025.11247985","type":"proceedings-article","created":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T18:56:45Z","timestamp":1764010605000},"page":"1-10","source":"Crossref","is-referenced-by-count":2,"title":["Small and Fast LLMs on Commodity Hardware: Post-Training Quantization in llama. cpp"],"prefix":"10.1109","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9450-7387","authenticated-orcid":false,"given":"Lorenz","family":"Sparrenberg","sequence":"first","affiliation":[{"name":"University of Bonn,Bonn,Germany"}]},{"given":"Tobias","family":"Deu\u00dfer","sequence":"additional","affiliation":[{"name":"University of Bonn,Bonn,Germany"}]},{"given":"Armin","family":"Berger","sequence":"additional","affiliation":[{"name":"University of Bonn,Bonn,Germany"}]},{"given":"Rafet","family":"Sifa","sequence":"additional","affiliation":[{"name":"University of Bonn,Bonn,Germany"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Llama: Open and efficient foundation language models","volume-title":"ArXiv, vol. abs\/2302.13971","author":"Touvron","year":"2023"},{"key":"ref2","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Advances in neural information processing systems","volume":"33","author":"Brown","year":"2020"},{"issue":"1","key":"ref3","article-title":"Palm: scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"Journal of Machine Learning Research"},{"key":"ref4","article-title":"Efficient large language models: A survey","volume-title":"Transactions on Machine Learning Research","author":"Wan","year":"2024"},{"key":"ref5","doi-asserted-by":"crossref","DOI":"10.2139\/ssrn.4996660","article-title":"A survey of low-bit large language models: Basics, systems, and algorithms","volume-title":"ArXiv","author":"Gong","year":"2024"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/bigdata62323.2024.10825055"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/bigdata59044.2023.10386518"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/bigdata62323.2024.10825458"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/bigdata62323.2024.10825695"},{"key":"ref10","article-title":"A white paper on neural network quantization","volume-title":"ArXiv","author":"Nagel","year":"2021"},{"key":"ref11","volume-title":"llama.cpp: Inference of llama model in pure c\/c++. GitHub.","year":"2023"},{"key":"ref12","article-title":"Llm.int8(): 8-bit matrix multiplication for transformers at scale","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems, ser. NIPS \u201822. Red Hook","author":"Dettmers","year":"2022"},{"key":"ref13","first-page":"38087","article-title":"Smoothquant: Accurate and efficient post-training quantization for large language models","volume-title":"International Conference on Machine Learning.","author":"Xiao"},{"key":"ref14","article-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers","volume-title":"11th International Conference on Learning Representations","author":"Frantar","year":"2023"},{"key":"ref15","first-page":"87","article-title":"Awq: Activation-aware weight quantization for on-device llm compression and acceleration","volume-title":"Proceedings of Machine Learning and Systems","volume":"6","author":"Lin","year":"2024"},{"key":"ref16","first-page":"10088","article-title":"Qlora: Efficient finetuning of quantized llms","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Dettmers","year":"2023"},{"key":"ref17","first-page":"27168","article-title":"Zeroquant: Efficient and affordable post-training quantization for large-scale transformers","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Yao","year":"2022"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2018.00286"},{"key":"ref19","volume-title":"Quantization - PyTorch 2.7 documentation, The PyTorch Foundation","author":"Team","year":"2024"},{"key":"ref20","article-title":"Low rank quantization-aware training for LLMs","volume-title":"Workshop on Efficient Systems for Foundation Models II @ ICML2024","author":"Bondarenko"},{"key":"ref21","volume-title":"Quantization aware training TensorFlow - 2.19 documentation, Google","author":"Team","year":"2024"},{"key":"ref22","article-title":"Efficientqat: Efficient quantization-aware training for large language models","volume-title":"CoRR, vol. abs\/2407.11062","author":"Chen","year":"2024"},{"key":"ref23","article-title":"Mistral 7b","volume-title":"Mistral, Tech. Rep.","author":"Jiang","year":"2023"},{"key":"ref24","article-title":"Qwen technical report","volume-title":"Alibaba Cloud, Tech. Rep.","author":"Bai","year":"2023"},{"key":"ref25","article-title":"Gemma: Open models based on gemini research and technology","author":"Mesnard","year":"2024","journal-title":"CoRR, vol. abs\/2403.08295"},{"key":"ref26","volume-title":"Phi-2: The surprisingpower of small language models","author":"Abdin","year":"2023"},{"key":"ref27","article-title":"Mamba: Linear-time sequence modeling with selective state spaces","volume-title":"arXiv preprint","author":"Gu","year":"2023"},{"key":"ref28","article-title":"ggml-quants.c and ggml-quants.h","volume-title":"GitHub, 2023\u20132024, source code for quantization routines.","author":"Gerganov","year":"2025"},{"key":"ref29","volume-title":"GGUF (pull request #2398, ggml-org\/llama.cpp). GitHub.","author":"Gerganov","year":"2023"},{"key":"ref30","article-title":"GGUF file-structure diagram (v3)","volume-title":"GitHub, 2024, diagram embedded in project documentation; author credited as @mishig25","author":"Davaadorj","year":"2025"},{"key":"ref31","volume-title":"K-quants (pull request: #1684, ggml-org\/llama.cpp). GitHub","author":"Kawrakow","year":"2023"},{"key":"ref32","volume-title":"Introduce k-bit group quantization formats (pull request #3360, ggml-org\/llama.cpp). GitHub","author":"Gerganov","year":"2024"},{"key":"ref33","volume-title":"Tensor encoding schemes (iq\/k quant bpw table). GitHub","year":"2024"},{"key":"ref34","volume-title":"Sota 2-bit quants (pull request: #4773, ggml-org\/llama.cpp). GitHub. Pull request introducing state-of-the-art 2-bit quantization (IQ2_XXS) to llama.cpp","author":"Kawrakow","year":"2024"},{"key":"ref35","article-title":"Sota 2-bit quants - part 2 (adds iq2_xs 2.31 bpw), (pull request#4856)","volume-title":"GitHub","author":"Kawrakow","year":"2024"},{"key":"ref36","article-title":"1.5 bit quantization (iql_s) (pull request #5453)","volume-title":"GitHub","author":"Kawrakow","year":"2024"},{"key":"ref37","article-title":"Iq3_s: a much better alternative to q3_k (pull request #5676)","volume-title":"GitHub","author":"Kawrakow","year":"2024"},{"key":"ref38","volume-title":"Importance matrix calculation (pull request #4861). GitHub","author":"Kawrakow","year":"2024"},{"key":"ref39","article-title":"Llama 2: Open foundation and fine-tuned chat models","volume-title":"arXiv preprint","author":"Touvron","year":"2023"},{"key":"ref40","article-title":"Quantization results table and formats (Q4_K_M, Q5_K_M, Q6_K, etc.)","volume-title":"GitHub","author":"Gerganov","year":"2025"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3530811"},{"key":"ref42","article-title":"A framework for few-shot language model evaluation","volume-title":"ACL 2023 Workshop on Benchmarking: Past, Present and Future","author":"Gao","year":"2023"},{"key":"ref43","article-title":"Measuring massive multitask language understanding","volume-title":"arXiv preprint","author":"Hendrycks","year":"2020"},{"key":"ref44","first-page":"4791","article-title":"HellaSwag: Can a machine really finish your sentence?","volume-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","author":"Zellers"},{"key":"ref45","first-page":"3214","article-title":"TruthfulQA: Measuring how models mimic human falsehoods","volume-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Lin"},{"key":"ref46","article-title":"Fast transformer decoding: One write-head is all you need","volume-title":"arXiv preprint","author":"Shazeer","year":"2019"},{"key":"ref47","article-title":"feat: add support for quantizing the k-cache (pull request #7412)","volume-title":"GitHub","author":"Gerganov","year":"2024"},{"key":"ref48","first-page":"4396","article-title":"Quip: 2-bit quantization of large language models with guarantees","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Chee","year":"2023"},{"key":"ref49","doi-asserted-by":"crossref","first-page":"12186","DOI":"10.18653\/v1\/2024.findings-acl.726","article-title":"A comprehensive evaluation of quantization strategies for large language models","volume-title":"Findings of the Association for Computational Linguistics ACL 2024","author":"Jin","year":"2024"},{"key":"ref50","article-title":"Introducing openai o3 and o4-mini","volume-title":"official OpenAI announcement and overview of the o3 and o4-mini reasoning models","year":"2025"},{"key":"ref51","volume-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2025"}],"event":{"name":"2025 IEEE 12th International Conference on Data Science and Advanced Analytics (DSAA)","location":"Birmingham, United Kingdom","start":{"date-parts":[[2025,10,9]]},"end":{"date-parts":[[2025,10,12]]}},"container-title":["2025 IEEE 12th International Conference on Data Science and Advanced Analytics (DSAA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11247920\/11247921\/11247985.pdf?arnumber=11247985","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,2]],"date-time":"2025-12-02T06:20:40Z","timestamp":1764656440000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11247985\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,9]]},"references-count":51,"URL":"https:\/\/doi.org\/10.1109\/dsaa65442.2025.11247985","relation":{},"subject":[],"published":{"date-parts":[[2025,10,9]]}}}