{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,31]],"date-time":"2025-08-31T08:10:01Z","timestamp":1756627801506,"version":"3.44.0"},"reference-count":106,"publisher":"Springer Science and Business Media LLC","issue":"25","license":[{"start":{"date-parts":[[2025,8,9]],"date-time":"2025-08-09T00:00:00Z","timestamp":1754697600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,9]],"date-time":"2025-08-09T00:00:00Z","timestamp":1754697600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1007\/s00521-025-11529-3","type":"journal-article","created":{"date-parts":[[2025,8,9]],"date-time":"2025-08-09T08:16:56Z","timestamp":1754727416000},"page":"20823-20878","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A survey on 1-bit quantized large language models"],"prefix":"10.1007","volume":"37","author":[{"given":"Kritika","family":"Tripathi","sequence":"first","affiliation":[]},{"given":"Devanshi","family":"Malik","sequence":"additional","affiliation":[]},{"given":"Abhi","family":"Akshat","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9492-7653","authenticated-orcid":false,"given":"Kusum","family":"Lata","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,9]]},"reference":[{"unstructured":"Naveed H, Khan AU, Qiu S et al (2023) A comprehensive overview of large language models","key":"11529_CR1"},{"unstructured":"Brants T, Popat AC, Xu P et al (2007) Large language models in machine translation. 
Association for Computational Linguistics","key":"11529_CR2"},{"unstructured":"Vaswani A, Shazeer N, Parmar N et al (2017) Attention is all you need","key":"11529_CR3"},{"unstructured":"Touvron H, Lavril T, Izacard G et al (2023) LLaMA: open and efficient foundation language models","key":"11529_CR4"},{"unstructured":"OpenAI, Achiam J, Adler S et al (2023) GPT-4 technical report","key":"11529_CR5"},{"unstructured":"Almazrouei E, Alobeidli H, Alshamsi A et al (2023) The Falcon series of open language models","key":"11529_CR6"},{"unstructured":"Workshop B, Scao T Le et al (2022) BLOOM: a 176B-parameter open-access multilingual language model","key":"11529_CR7"},{"unstructured":"Jiang AQ, Sablayrolles A, Mensch A et al (2023) Mistral 7B","key":"11529_CR8"},{"unstructured":"Ram\u00e9 A, Vieillard N, Hussenot L et al (2024) WARM: on the benefits of weight averaged reward models","key":"11529_CR9"},{"unstructured":"Shuster K, Xu J, Komeili M et al (2022) BlenderBot 3: a deployed conversational agent that continually learns to responsibly engage","key":"11529_CR10"},{"unstructured":"Team L, Meta A@ (2024) The Llama 3 herd of models","key":"11529_CR11"},{"unstructured":"Hinton G, Vinyals O, Dean J (2015) Distilling the knowledge in a neural network","key":"11529_CR12"},{"unstructured":"Xu X, Li M, Tao C et al (2024) A survey on knowledge distillation of large language models","key":"11529_CR13"},{"unstructured":"Xu Y, Han X, Yang Z et al (2024) OneBit: towards extremely low-bit large language models","key":"11529_CR14"},{"doi-asserted-by":"crossref","unstructured":"Zhao Z, Gan L, Wang G et al (2024) LoraRetriever: input-aware LoRA retrieval and composition for mixed tasks in the wild","key":"11529_CR15","DOI":"10.18653\/v1\/2024.findings-acl.263"},{"unstructured":"Dettmers T, Pagnoni A, Holtzman A, Zettlemoyer L (2023) QLoRA: efficient finetuning of quantized LLMs","key":"11529_CR16"},{"unstructured":"Xu Y, Xie L, Gu X et al (2023) QA-LoRA: quantization-aware low-rank adaptation of large language models","key":"11529_CR17"},{"doi-asserted-by":"crossref","unstructured":"Yang F, Zhao P, Wang Z et al (2023) Empower large language model to perform better on industrial domain-specific question answering","key":"11529_CR18","DOI":"10.18653\/v1\/2023.emnlp-industry.29"},{"unstructured":"Touvron H, Martin L, Stone K et al (2023) Llama 2: open foundation and fine-tuned chat models","key":"11529_CR19"},{"unstructured":"Liu H, Li C, Wu Q, Lee YJ (2023) Visual instruction tuning","key":"11529_CR20"},{"unstructured":"Chen B, Zhang Z, Langren\u00e9 N, Zhu S Unleashing the potential of prompt engineering in Large Language Models: a comprehensive review","key":"11529_CR21"},{"doi-asserted-by":"crossref","unstructured":"Yang L, Chen H, Li Z et al (2023) Give us the facts: enhancing large language models with knowledge graphs for fact-aware language modeling","key":"11529_CR22","DOI":"10.1109\/TKDE.2024.3360454"},{"unstructured":"Nori H, King N, McKinney SM et al (2023) Capabilities of GPT-4 on medical challenge problems","key":"11529_CR23"},{"unstructured":"Singhal K, Tu T, Gottweis J et al (2023) Towards expert-level medical question answering with large language models","key":"11529_CR24"},{"unstructured":"Liu Z, Huang Y, Yu X et al (2023) DeID-GPT: zero-shot medical text de-identification by GPT-4","key":"11529_CR25"},{"unstructured":"Wu S, Irsoy O, Lu S et al (2023) BloombergGPT: a large language model for finance","key":"11529_CR26"},{"unstructured":"Trautmann D, Petrova A, Schilder F (2022) Legal prompt 
engineering for multilingual legal judgement prediction","key":"11529_CR27"},{"doi-asserted-by":"crossref","unstructured":"Magar I, Schwartz R (2022) Data contamination: from memorization to exploitation. In: Proceedings of the annual meeting of the association for computational linguistics. Association for Computational Linguistics (ACL), pp 157\u2013165","key":"11529_CR28","DOI":"10.18653\/v1\/2022.acl-short.18"},{"unstructured":"Dan Y, Lei Z, Gu Y et al (2023) EduChat: a large-scale language model-based chatbot system for intelligent education","key":"11529_CR29"},{"unstructured":"Sohn SS, Li D, Zhang S et al (2024) From words to worlds: transforming one-line prompt into immersive multi-modal digital stories with communicative LLM agent","key":"11529_CR30"},{"unstructured":"Shi J, Li J, Ma Q et al (2024) CHOPS: CHat with custOmer profile systems for customer service with LLMs","key":"11529_CR31"},{"doi-asserted-by":"crossref","unstructured":"Xu Z, Cruz MJ, Guevara M et al (2024) Retrieval-augmented generation with knowledge graphs for customer service question answering. Association for Computing Machinery (ACM), pp 2905\u20132909","key":"11529_CR32","DOI":"10.1145\/3626772.3661370"},{"doi-asserted-by":"crossref","unstructured":"Zhang W, Deng Y, Liu B et al (2023) Sentiment analysis in the era of large language models: a reality check","key":"11529_CR33","DOI":"10.18653\/v1\/2024.findings-naacl.246"},{"unstructured":"Ma S, Wang H, Ma L et al (2024) The era of 1-bit LLMs: all large language models are in 1.58 bits","key":"11529_CR34"},{"unstructured":"Shang Y, Yuan Z, Wu Q, Dong Z (2023) PB-LLM: partially binarized large language models","key":"11529_CR35"},{"unstructured":"Frantar E, Ashkboos S, Hoefler T, Alistarh D (2022) GPTQ: accurate post-training quantization for generative pre-trained transformers","key":"11529_CR36"},{"unstructured":"Huang W, Liu Y, Qin H et al (2024) BiLLM: pushing the limit of post-training quantization for LLMs","key":"11529_CR37"},{"unstructured":"Shao W, Chen M, Zhang Z et al (2023) OmniQuant: omnidirectionally calibrated quantization for large language models","key":"11529_CR38"},{"unstructured":"Guo Y (2018) A survey on methods and theories of quantized neural networks","key":"11529_CR39"},{"doi-asserted-by":"crossref","unstructured":"Zuo X, Yang X, Dou Z, Wen JR (2019) RUCIR at TREC 2019: conversational assistance track. In: 28th text REtrieval conference, TREC 2019\u2014proceedings. National Institute of Standards and Technology (NIST)","key":"11529_CR40","DOI":"10.6028\/NIST.SP.1250.cast-RUCIR"},{"unstructured":"Wan Z, Wang X, Liu C et al (2023) Efficient large language models: a survey","key":"11529_CR41"},{"doi-asserted-by":"crossref","unstructured":"Williams M, Aletras N (2023) On the impact of calibration data in post-training quantization and pruning","key":"11529_CR42","DOI":"10.18653\/v1\/2024.acl-long.544"},{"doi-asserted-by":"crossref","unstructured":"Chitty-Venkata KT, Mittal S, Emani M et al (2023) A survey of techniques for optimizing transformer inference","key":"11529_CR43","DOI":"10.1016\/j.sysarc.2023.102990"},{"doi-asserted-by":"crossref","unstructured":"Gholami A, Kim S, Dong Z et al (2021) A survey of quantization methods for efficient neural network inference","key":"11529_CR44","DOI":"10.1201\/9781003162810-13"},{"key":"11529_CR45","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29765","author":"Z Gong","year":"2024","unstructured":"Gong Z, Liu J, Wang J et al (2024) What makes quantization for large language models hard? 
An empirical study from the lens of perturbation. Proc AAAI Conf Artif Intell. https:\/\/doi.org\/10.1609\/aaai.v38i16.29765","journal-title":"Proc AAAI Conf Artif Intell"},{"unstructured":"Li S, Ning X, Wang L et al (2024) Evaluating quantized large language models","key":"11529_CR46"},{"doi-asserted-by":"crossref","unstructured":"Zhu X, Li J, Liu Y et al (2023) A survey on model compression for large language models","key":"11529_CR47","DOI":"10.1162\/tacl_a_00704"},{"unstructured":"Devlin J, Chang M-W, Lee K et al BERT: pre-training of deep bidirectional transformers for language understanding","key":"11529_CR48"},{"unstructured":"McMahan HB, Ramage D, Talwar K, Zhang L (2017) Learning differentially private recurrent language models","key":"11529_CR49"},{"unstructured":"You Y, Li J, Reddi S et al (2019) Large batch optimization for deep learning: training BERT in 76 minutes","key":"11529_CR50"},{"unstructured":"Li Z, Wallace E, Shen S et al (2020) Train large, then compress: rethinking model size for efficient training and inference of transformers","key":"11529_CR51"},{"doi-asserted-by":"crossref","unstructured":"Rajbhandari S, Rasley J, Ruwase O, He Y (2019) ZeRO: memory optimizations toward training trillion parameter models","key":"11529_CR52","DOI":"10.1109\/SC41405.2020.00024"},{"unstructured":"Shi W, Ajith A, Xia M et al (2023) Detecting pretraining data from large language models","key":"11529_CR53"},{"unstructured":"Shoeybi M, Patwary M, Puri R et al (2019) Megatron-LM: training multi-billion parameter language models using model parallelism","key":"11529_CR54"},{"unstructured":"Hoffmann J, Borgeaud S, Mensch A et al (2022) Training compute-optimal large language models","key":"11529_CR55"},{"unstructured":"Kim S, Hooper C, Gholami A et al (2023) SqueezeLLM: dense-and-sparse quantization","key":"11529_CR56"},{"unstructured":"Brown TB, Mann B, Ryder N et al Language models are few-shot learners","key":"11529_CR57"},{"unstructured":"Kaplan J, McCandlish S, Henighan T et al (2020) Scaling laws for neural language models","key":"11529_CR58"},{"unstructured":"Thoppilan R, De Freitas D, Hall J et al (2022) LaMDA: language models for dialog applications","key":"11529_CR59"},{"unstructured":"Sun M, Liu Z, Bair A, Kolter JZ (2023) A simple and effective pruning approach for large language models","key":"11529_CR60"},{"unstructured":"Chee J, Cai Y, Kuleshov V, De Sa C (2023) QuIP: 2-bit quantization of large language models with guarantees","key":"11529_CR61"},{"unstructured":"Biderman S, Schoelkopf H, Anthony Q et al (2023) Pythia: a suite for analyzing large language models across training and scaling","key":"11529_CR62"},{"unstructured":"Gu Y, Dong L, Wei F, Huang M (2023) MiniLLM: knowledge distillation of large language models","key":"11529_CR63"},{"unstructured":"Peng B, Quesnelle J, Fan H, Shippole E (2023) YaRN: efficient context window extension of large language models","key":"11529_CR64"},{"unstructured":"Mukherjee S, Liu X, Zheng G et al CLUES: Few-shot learning evaluation in natural language understanding","key":"11529_CR65"},{"unstructured":"Gala J, Jayakumar T, Husain JA et al (2024) Airavata: introducing Hindi instruction-tuned LLM","key":"11529_CR66"},{"unstructured":"Parida S, Panwar S, Lata K et al (2024) Building pre-train LLM dataset for the INDIC languages: a case study on Hindi","key":"11529_CR67"},{"doi-asserted-by":"crossref","unstructured":"Khan MSUR, Mehta P, Sankar A et al (2024) IndicLLMSuite: a blueprint for creating pre-training and fine-tuning datasets 
for indian languages","key":"11529_CR68","DOI":"10.18653\/v1\/2024.acl-long.843"},{"unstructured":"Shen Z, Tao T, Ma L et al (2023) SlimPajama-DC: understanding data combinations for LLM training","key":"11529_CR69"},{"unstructured":"Glaese A, McAleese N, Tr\u0119bacz M et al (2022) Improving alignment of dialogue agents via targeted human judgements","key":"11529_CR70"},{"unstructured":"Xiao G, Lin J, Seznec M et al (2022) SmoothQuant: accurate and efficient post-training quantization for large language models","key":"11529_CR71"},{"doi-asserted-by":"crossref","unstructured":"Hu Z, Wang L, Lan Y et al (2023) LLM-adapters: an adapter family for parameter-efficient fine-tuning of large language models","key":"11529_CR72","DOI":"10.18653\/v1\/2023.emnlp-main.319"},{"doi-asserted-by":"crossref","unstructured":"Shao Z, Gong Y, Shen Y et al (2023) Enhancing retrieval-augmented large language models with iterative retrieval-generation synergy","key":"11529_CR73","DOI":"10.18653\/v1\/2023.findings-emnlp.620"},{"unstructured":"Wang Y, Ivison H, Dasigi P et al (2023) How far can camels go? Exploring the state of instruction tuning on open resources","key":"11529_CR74"},{"unstructured":"Sabane M, Litake O, Chadha A (2023) Breaking language barriers: a question answering dataset for Hindi and Marathi","key":"11529_CR75"},{"unstructured":"Tunstall L, Beeching E, Lambert N et al (2023) Zephyr: direct distillation of LM alignment","key":"11529_CR76"},{"unstructured":"Gao Y, Sheng T, Xiang Y et al (2023) Chat-REC: towards interactive and explainable LLMs-augmented recommender system","key":"11529_CR77"},{"unstructured":"Openai AR, Openai KN, Openai TS, Openai IS Improving language understanding by generative pre-training","key":"11529_CR78"},{"unstructured":"Li X, Tram\u00e8r F, Liang P, Hashimoto T (2021) Large language models can be strong differentially private learners","key":"11529_CR79"},{"unstructured":"Yao Z, Wu X, Li C et al (2023) ZeroQuant-V2: exploring post-training quantization in LLMs from comprehensive study to low rank compensation","key":"11529_CR80"},{"unstructured":"Kim S, Gholami A, Yao Z et al (2021) I-BERT: integer-only BERT quantization","key":"11529_CR81"},{"doi-asserted-by":"crossref","unstructured":"Lewis M, Liu Y, Goyal N et al (2019) BART: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension","key":"11529_CR82","DOI":"10.18653\/v1\/2020.acl-main.703"},{"unstructured":"Wei X, Gong R, Li Y et al (2022) QDrop: randomly dropping quantization for extremely low-bit post-training quantization","key":"11529_CR83"},{"key":"11529_CR84","doi-asserted-by":"publisher","first-page":"482","DOI":"10.1109\/89.326608","volume":"2","author":"L Wu","year":"1994","unstructured":"Wu L, Niranjan M, Fallside F (1994) Fully vector-quantized neural network-based code-excited nonlinear predictive speech coding. IEEE Trans Speech Audio Process 2:482\u2013489. 
https:\/\/doi.org\/10.1109\/89.326608","journal-title":"IEEE Trans Speech Audio Process"},{"doi-asserted-by":"crossref","unstructured":"Oualil Y, Klakow D (2017) A batch noise contrastive estimation approach for training large vocabulary language models","key":"11529_CR85","DOI":"10.21437\/Interspeech.2017-818"},{"unstructured":"Zhu K, Zhao Q, Chen H et al (2023) PromptBench: a unified library for evaluation of large language models","key":"11529_CR86"},{"unstructured":"Narayanan D, Phanishayee A, Shi K et al (2020) Memory-efficient pipeline-parallel DNN training","key":"11529_CR87"},{"unstructured":"Sheng Y, Zheng L, Yuan B et al (2023) FlexGen: high-throughput generative inference of large language models with a single GPU","key":"11529_CR88"},{"unstructured":"Vinyals O, Kaiser L, Koo T et al (2014) Grammar as a foreign language","key":"11529_CR89"},{"doi-asserted-by":"crossref","unstructured":"Petrov S, Barrett L, Thibaux R, Klein D (2006) Learning accurate, compact, and interpretable tree annotation","key":"11529_CR90","DOI":"10.3115\/1220175.1220230"},{"unstructured":"Zhu M, Zhang Y, Chen W et al Fast and accurate shift-reduce constituent parsing","key":"11529_CR91"},{"doi-asserted-by":"crossref","unstructured":"Dyer C, Kuncoro A, Ballesteros M, Smith NA (2016) Recurrent neural network grammars","key":"11529_CR92","DOI":"10.18653\/v1\/N16-1024"},{"doi-asserted-by":"crossref","unstructured":"Huang Z, Harper M (2009) Self-training PCFG grammars with latent annotations across languages","key":"11529_CR93","DOI":"10.3115\/1699571.1699621"},{"doi-asserted-by":"crossref","unstructured":"Mcclosky D, Charniak E, Johnson M (2006) Effective self-training for parsing","key":"11529_CR94","DOI":"10.3115\/1220835.1220855"},{"unstructured":"Luong M-T, Le QV, Sutskever I et al (2015) Multi-task sequence to sequence learning","key":"11529_CR95"},{"unstructured":"Wang H, Ma S, Dong L et al (2023) BitNet: scaling 1-bit transformers for large language models","key":"11529_CR96"},{"doi-asserted-by":"crossref","unstructured":"Chen M, Shao W, Xu P et al (2024) EfficientQAT: efficient quantization-aware training for large language models","key":"11529_CR97","DOI":"10.18653\/v1\/2025.acl-long.498"},{"unstructured":"Zhu R-J, Zhang Y, Sifferman E et al (2024) Scalable MatMul-free language modeling","key":"11529_CR98"},{"unstructured":"Kaushal A, Pandey T, Vaidhya T et al (2024) Spectra: a comprehensive study of ternary, quantized, and FP16 language models","key":"11529_CR99"},{"unstructured":"Frantar E, Alistarh D (2023) QMoE: practical sub-1-bit compression of trillion-parameter models","key":"11529_CR100"},{"unstructured":"Zhang T, Yi J, Xu Z, Shrivastava A (2024) KV cache is 1 Bit per channel: efficient large language model inference with coupled quantization","key":"11529_CR101"},{"doi-asserted-by":"crossref","unstructured":"Zandieh A, Daliri M, Han I (2024) QJL: 1-bit quantized JL transform for KV cache quantization with zero overhead","key":"11529_CR102","DOI":"10.1609\/aaai.v39i24.34773"},{"unstructured":"Jo D, Kim T, Kim Y, Kim J-J Mixture of scales: memory-efficient token-adaptive binarization for large language models","key":"11529_CR103"},{"unstructured":"Dong P, Li L, Du D et al (2024) STBLLM: breaking the 1-bit barrier with structured binary LLMs","key":"11529_CR104"},{"unstructured":"Malekar J, Elbtity ME, Zand R (2024) Matmul or no Matmul in the era of 1-bit LLMs","key":"11529_CR105"},{"doi-asserted-by":"crossref","unstructured":"Bal M, Jiang Y, Sengupta A (2024) Exploring extreme quantization in 
spiking language models","key":"11529_CR106","DOI":"10.1109\/ICONS62911.2024.00047"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-025-11529-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00521-025-11529-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-025-11529-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,31]],"date-time":"2025-08-31T07:31:41Z","timestamp":1756625501000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00521-025-11529-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,9]]},"references-count":106,"journal-issue":{"issue":"25","published-print":{"date-parts":[[2025,9]]}},"alternative-id":["11529"],"URL":"https:\/\/doi.org\/10.1007\/s00521-025-11529-3","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"type":"print","value":"0941-0643"},{"type":"electronic","value":"1433-3058"}],"subject":[],"published":{"date-parts":[[2025,8,9]]},"assertion":[{"value":"29 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}
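The record above was originally captured as the raw Crossref REST API response for this DOI (a `works` query returning the metadata inside a `message` object). A minimal sketch of retrieving the same work metadata by DOI is shown below, assuming Python with the `requests` package and network access; the contact e-mail in the User-Agent is a placeholder, not taken from the source.

```python
import requests

# Crossref REST API endpoint for a single work, keyed by DOI.
DOI = "10.1007/s00521-025-11529-3"
url = f"https://api.crossref.org/works/{DOI}"

# Including a contact address in the User-Agent follows Crossref's "polite pool"
# convention; the address below is a placeholder.
headers = {"User-Agent": "metadata-check/0.1 (mailto:you@example.org)"}

resp = requests.get(url, headers=headers, timeout=30)
resp.raise_for_status()

# Crossref wraps the work record in a "message" object.
work = resp.json()["message"]

print(work["title"][0])               # article title
print(work["container-title"][0])     # journal name
print(work["DOI"], work.get("page"))  # DOI and page range
print("references:", work.get("references-count"))
```

The keys read out here (`title`, `container-title`, `page`, `references-count`) correspond directly to the fields of the original JSON record summarized above.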