{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T21:30:34Z","timestamp":1773955834851,"version":"3.50.1"},"reference-count":55,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,26]],"date-time":"2025-10-26T00:00:00Z","timestamp":1761436800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,26]],"date-time":"2025-10-26T00:00:00Z","timestamp":1761436800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,26]]},"DOI":"10.1109\/iccad66269.2025.11240685","type":"proceedings-article","created":{"date-parts":[[2025,11,20]],"date-time":"2025-11-20T18:39:34Z","timestamp":1763663974000},"page":"1-9","source":"Crossref","is-referenced-by-count":1,"title":["Squat: Quant Small Language Models on the Edge"],"prefix":"10.1109","author":[{"given":"Xuan","family":"Shen","sequence":"first","affiliation":[{"name":"Northeastern University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Peiyan","family":"Dong","sequence":"additional","affiliation":[{"name":"MIT"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenglun","family":"Kong","sequence":"additional","affiliation":[{"name":"Harvard University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifan","family":"Gong","sequence":"additional","affiliation":[{"name":"Northeastern University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Changdi","family":"Yang","sequence":"additional","affiliation":[{"name":"Northeastern University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhaoyang","family":"Han","sequence":"additional","affiliation":[{"name":"Northeastern University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanyue","family":"Xie","sequence":"additional","affiliation":[{"name":"Northeastern University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lei","family":"Lu","sequence":"additional","affiliation":[{"name":"Northeastern University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cheng","family":"Lyu","sequence":"additional","affiliation":[{"name":"Northeastern University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chao","family":"Wu","sequence":"additional","affiliation":[{"name":"Northeastern University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanzhi","family":"Wang","sequence":"additional","affiliation":[{"name":"Northeastern University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pu","family":"Zhao","sequence":"additional","affiliation":[{"name":"Northeastern University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Smollm2: When smol goes big\u2013data-centric training of a small language model","author":"Allal","year":"2025"},{"key":"ref2","article-title":"Accelerating deep learning model inference on arm cpus with ultra-low bit quantization and runtime","author":"Ashfaq","year":"2022"},{"key":"ref3","author":"Bengio","year":"2013","journal-title":"Estimating or propagating gradients through stochastic neurons for conditional computation"},{"key":"ref4","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"NeurIPS"},{"key":"ref5","author":"Brown","year":"2020","journal-title":"Language models are few-shot learners"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.498"},{"key":"ref7","author":"Chen","year":"2024","journal-title":"Octopus v2: On-device language model for super agent"},{"key":"ref8","author":"Choi","year":"2018","journal-title":"Pact: Parameterized clipping activation for quantized neural networks"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071047"},{"key":"ref10","author":"Dukhan","year":"2018","journal-title":"Qnnpack: Open source library for optimized mobile deep learning"},{"key":"ref11","author":"Frantar","year":"2022","journal-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers"},{"key":"ref12","author":"Hinton","year":"2015","journal-title":"Distilling the knowledge in a neural network"},{"key":"ref13","author":"Hu","year":"2024","journal-title":"Minicpm: Unveiling the potential of small language models with scalable training strategies"},{"key":"ref14","article-title":"gemmlowp: A small self-contained low-precision gemm library","volume":"14","author":"Jacob","year":"2017","journal-title":"Retrieved June"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00141"},{"key":"ref16","article-title":"Token-scaled logit distillation for ternary weight generative language models","volume":"36","author":"Kim","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref17","author":"Kim","year":"2023","journal-title":"Squeezellm: Dense-and-sparse quantization"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20083-0_37"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/449"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3714983.3714987"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.26"},{"key":"ref22","article-title":"Mobilellm: Optimizing sub-billion parameter language models for on-device use cases","author":"Liu","year":"2024"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1971.1054681"},{"key":"ref24","author":"Park","year":"2022","journal-title":"Nipq: Noise injection pseudo quantization for automated dnn optimization"},{"key":"ref25","article-title":"Bibert: Accurate fully binarized bert","volume-title":"ICLR","author":"Qin"},{"issue":"8","key":"ref26","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref27","author":"Radford","year":"2019","journal-title":"Language models are unsupervised multitask learners"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29860"},{"key":"ref29","author":"Shen","year":"2025","journal-title":"Draftattention: Fast video diffusion via low-resolution attention guidance"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2024.3487781"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01069"},{"key":"ref32","article-title":"Fastcar: Cache attentive replay for fast auto-regressive video generation on the edge","author":"Shen","year":"2025"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i19.34248"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i19.34249"},{"key":"ref35","article-title":"Search for efficient large language models","author":"Shen","year":"2024","journal-title":"NeurIPS"},{"key":"ref36","article-title":"Sparse learning for state space models on mobile","volume-title":"ICLR","author":"Shen"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.331"},{"key":"ref38","author":"Tastet","year":"2024","journal-title":"Babyllama-2: Ensemble-distilled models consistently outperform teachers with limited data"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.conll-babylm.24"},{"key":"ref40","author":"Touvron","year":"2023","journal-title":"Llama: Open and efficient foundation language models"},{"key":"ref41","article-title":"A survey of small language models","author":"Van Nguyen","year":"2024"},{"key":"ref42","article-title":"Superglue: A stickier benchmark for general-purpose language understanding systems","volume":"32","author":"Wang","year":"2019","journal-title":"NeurIPS"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00321"},{"key":"ref45","author":"Wu","year":"2023","journal-title":"Zeroquant-fp: A leap forward in llms post-training w4a8 quantization using floating-point formats"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19800-7_6"},{"key":"ref47","first-page":"38087","article-title":"Smoothquant: Accurate and efficient post-training quantization for large language models","volume-title":"ICML","author":"Xiao"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01478"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.conll-babylm.7"},{"key":"ref50","first-page":"27168","article-title":"Zeroquant: Efficient and affordable post-training quantization for large-scale transformers","volume":"35","author":"Yao","year":"2022","journal-title":"NeurIPS"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00478"},{"key":"ref52","article-title":"Fast and memory-efficient video diffusion using streamlined inference","author":"Zhan","year":"2024","journal-title":"NeurIPS"},{"key":"ref53","article-title":"Opt: Open pre-trained transformer language models","author":"Zhang","year":"2022"},{"key":"ref54","article-title":"Fully open source moxin-7b technical report","author":"Zhao","year":"2024"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.566"}],"event":{"name":"2025 IEEE\/ACM International Conference On Computer Aided Design (ICCAD)","location":"Munich, Germany","start":{"date-parts":[[2025,10,26]]},"end":{"date-parts":[[2025,10,30]]}},"container-title":["2025 IEEE\/ACM International Conference On Computer Aided Design (ICCAD)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11240608\/11240621\/11240685.pdf?arnumber=11240685","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T05:43:01Z","timestamp":1763703781000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11240685\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,26]]},"references-count":55,"URL":"https:\/\/doi.org\/10.1109\/iccad66269.2025.11240685","relation":{},"subject":[],"published":{"date-parts":[[2025,10,26]]}}}