{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T22:41:51Z","timestamp":1780612911134,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T00:00:00Z","timestamp":1781913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,21]]},"DOI":"10.1145\/3745756.3809200","type":"proceedings-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:52:21Z","timestamp":1780059141000},"page":"216-230","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Vec-LUT: Vector Table Lookup for Parallel Ultra-Low-Bit LLM Inference on Edge Devices"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-5341-2303","authenticated-orcid":false,"given":"Xiangyu","family":"Li","sequence":"first","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2167-3882","authenticated-orcid":false,"given":"Chengyu","family":"Yin","sequence":"additional","affiliation":[{"name":"Beijing Jiaotong University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9545-3322","authenticated-orcid":false,"given":"Weijun","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, Beijing, 
China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0830-044X","authenticated-orcid":false,"given":"Jianyu","family":"Wei","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9107-013X","authenticated-orcid":false,"given":"Ting","family":"Cao","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7352-8955","authenticated-orcid":false,"given":"Yunxin","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"1bitLLM. 2024. bitnet_b1_58-3B. https:\/\/huggingface.co\/1bitLLM\/bitnet_b1_58-3B. Reproduction of BitNet b1.58 paper trained on RedPajama dataset for 100B tokens."},{"key":"e_1_3_2_1_2_1","unstructured":"Apple Inc. 2025. Apple Intelligence gets even more powerful with new capabilities across Apple devices. https:\/\/www.apple.com\/newsroom\/2025\/06\/apple-intelligence-gets-even-more-powerful-with-new-capabilities-across-apple-devices\/. Press Release."},{"key":"e_1_3_2_1_3_1","volume-title":"Neon - Improve the Multimedia User Experience. Arm Technology Website. https:\/\/www.arm.com\/technologies\/neon Accessed","author":"Limited Arm","year":"2025","unstructured":"Arm Limited. 2025. Neon - Improve the Multimedia User Experience. Arm Technology Website. https:\/\/www.arm.com\/technologies\/neon Accessed May 8, 2025."},{"key":"e_1_3_2_1_4_1","unstructured":"Arm Limited. 2025. Neoverse V1: A Revolution in High Performance Computing. Arm Limited. 
https:\/\/www.arm.com\/products\/silicon-ip-cpu\/neoverse\/neoverse-v1"},{"key":"e_1_3_2_1_5_1","volume-title":"Accelerating Large Language Model Decoding with Speculative Sampling. arXiv preprint arXiv:2302.01318","author":"Chen Charlie","year":"2023","unstructured":"Charlie Chen, Sebastian Borgeaud, Geoffrey Irving, Jean-Baptiste Lespiau, Laurent Sifre, and John Jumper. 2023. Accelerating Large Language Model Decoding with Speculative Sampling. arXiv preprint arXiv:2302.01318 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Advances in Neural Information Processing Systems","volume":"38","author":"Chen Mouxiang","year":"2025","unstructured":"Mouxiang Chen, Binyuan Hui, Zeyu Cui, Jiaxi Yang, Dayiheng Liu, Jianling Sun, Junyang Lin, and Zhongxin Liu. 2025. Parallel Scaling Law for Language Models. In Advances in Neural Information Processing Systems, Vol. 38."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.498"},{"key":"e_1_3_2_1_8_1","volume-title":"TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578\u2013594."},{"key":"e_1_3_2_1_9_1","unstructured":"compilade. 2024. ggml-quants: ternary packing for TriLMs and BitNet b1.58. GitHub Pull Request #8151. https:\/\/github.com\/ggml-org\/llama.cpp\/pull\/8151"},{"key":"e_1_3_2_1_10_1","volume-title":"8-bit Matrix Multiplication for Transformers at Scale. Advances in neural information processing systems 35","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. 
LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale. Advances in neural information processing systems 35 (2022), 30318\u201330332."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01249"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.7"},{"key":"e_1_3_2_1_13_1","volume-title":"CodeMonkeys: Scaling Test-Time Compute for Software Engineering. arXiv preprint arXiv:2501.14723","author":"Ehrlich Ryan","year":"2025","unstructured":"Ryan Ehrlich, Bradley Brown, Jordan Juravsky, Ronald Clark, Christopher R\u00e9, and Azalia Mirhoseini. 2025. CodeMonkeys: Scaling Test-Time Compute for Software Engineering. arXiv preprint arXiv:2501.14723 (2025)."},{"key":"e_1_3_2_1_14_1","unstructured":"Falcon-LLM Team. 2024. The Falcon 3 Family of Open Models. https:\/\/huggingface.co\/blog\/falcon3."},{"key":"e_1_3_2_1_15_1","volume-title":"GPTQ: Accurate Post-training Quantization for Generative Pretrained Transformers. In International Conference on Learning Representations.","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2023. GPTQ: Accurate Post-training Quantization for Generative Pretrained Transformers. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_16_1","unstructured":"ggml-org. 2025. llama.cpp: LLM inference in C\/C++. https:\/\/github.com\/ggml-org\/llama.cpp"},{"key":"e_1_3_2_1_17_1","volume-title":"Tenet: An Efficient Sparsity-Aware LUT-Centric Architecture for Ternary LLM Inference on Edge. arXiv preprint arXiv:2509.13765","author":"Huang Zhirui","year":"2025","unstructured":"Zhirui Huang, Rui Ma, Shijie Cao, Ran Shu, Ian Wang, Ting Cao, Chixiao Chen, and Yongqiang Xiong. 2025. Tenet: An Efficient Sparsity-Aware LUT-Centric Architecture for Ternary LLM Inference on Edge. 
arXiv preprint arXiv:2509.13765 (2025)."},{"key":"e_1_3_2_1_18_1","volume-title":"Gemma 3 on mobile and web with Google AI Edge. Google Developers Blog. https:\/\/developers.googleblog.com\/en\/gemma-3-on-mobile-and-web-with-google-ai-edge\/ Accessed","author":"Ikonomidis Marissa","year":"2025","unstructured":"Marissa Ikonomidis, T.J. Alumbaugh, Mark Sherwood, and Cormac Brick. 2025. Gemma 3 on mobile and web with Google AI Edge. Google Developers Blog. https:\/\/developers.googleblog.com\/en\/gemma-3-on-mobile-and-web-with-google-ai-edge\/ Accessed December 5, 2025."},{"key":"e_1_3_2_1_19_1","volume-title":"Intel\u00ae Core\u2122 i7-13700K Processor","author":"Intel Corporation","unstructured":"Intel Corporation. 2022. Intel\u00ae Core\u2122 i7-13700K Processor. Intel Corporation. https:\/\/www.intel.com\/content\/www\/us\/en\/products\/sku\/230500\/intel-core-i713700k-processor-30m-cache-up-to-5-40-ghz\/specifications.html"},{"key":"e_1_3_2_1_20_1","volume-title":"Intrinsics for Intel\u00ae Advanced Vector Extensions 2 (Intel\u00ae AVX2)","author":"Intel Corporation","year":"2021","unstructured":"Intel Corporation. 2022. Intrinsics for Intel\u00ae Advanced Vector Extensions 2 (Intel\u00ae AVX2). Intel Corporation. https:\/\/www.intel.com\/content\/www\/us\/en\/docs\/cpp-compiler\/developer-guide-reference\/2021-8\/intrinsics-for-avx2.html Intel\u00ae C++ Compiler Classic Developer Guide and Reference, Version 2021.10. Accessed May 8, 2025."},{"key":"e_1_3_2_1_21_1","volume-title":"Fix Performance Bottlenecks with Intel\u00ae VTune\u2122 Profiler. Intel Developer Website. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/oneapi\/vtune-profiler.html Accessed","author":"Intel Corporation","year":"2025","unstructured":"Intel Corporation. 2025. Fix Performance Bottlenecks with Intel\u00ae VTune\u2122 Profiler. Intel Developer Website. 
https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/oneapi\/vtune-profiler.html Accessed May 8, 2025."},{"key":"e_1_3_2_1_22_1","volume-title":"International Conference on Learning Representations. 40478\u201340525","author":"Kaushal Ayush","year":"2025","unstructured":"Ayush Kaushal, Tejas Vaidhya, Arnab Mondal, Tejas Pandey, Aaryan Bhagat, and Irina Rish. 2025. Surprising Effectiveness of Pretraining Ternary Language Model at Scale. In International Conference on Learning Representations. 40478\u201340525."},{"key":"e_1_3_2_1_23_1","volume-title":"Microsoft Tech Community. https:\/\/techcommunity.microsoft.com\/blog\/azuredevcommunityblog\/getting-started-generative-ai-with-phi-3-mini-running-phi-3-mini-in-intel-ai-p\/4147246 Updated","year":"2024","unstructured":"kinfey. 2024. Getting Started - Generative AI with Phi-3-mini: Running Phi-3-mini in Intel AI PC. Microsoft Developer Community Blog, Microsoft Tech Community. https:\/\/techcommunity.microsoft.com\/blog\/azuredevcommunityblog\/getting-started-generative-ai-with-phi-3-mini-running-phi-3-mini-in-intel-ai-p\/4147246 Updated May 22, 2024, Version 4.0. Accessed December 5, 2025."},{"key":"e_1_3_2_1_24_1","volume-title":"International Conference on Machine Learning","volume":"202","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. 2023. Fast Inference from Transformers via Speculative Decoding. In International Conference on Machine Learning, Vol. 202. PMLR, 19274\u201319286."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1140"},{"key":"e_1_3_2_1_26_1","first-page":"87","article-title":"AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration","volume":"6","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. 2024. 
AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration. In Proceedings of Machine Learning and Systems, Vol. 6. 87\u2013100.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_27_1","unstructured":"Zechun Liu Changsheng Zhao Hanxian Huang Sijia Chen Jing Zhang Jiawei Zhao Scott Roy Lisa Jin Yunyang Xiong Yangyang Shi et al. 2025. ParetoQ: Improving Scaling Laws in Extremely Low-Bit LLM Quantization. arXiv preprint arXiv:2502.02631 (2025)."},{"key":"e_1_3_2_1_28_1","unstructured":"Shuming Ma Hongyu Wang Shaohan Huang Xingxing Zhang Ying Hu Ting Song Yan Xia and Furu Wei. 2025. BitNet b1.58 2B4T Technical Report. arXiv preprint arXiv:2504.12285 (2025)."},{"key":"e_1_3_2_1_29_1","volume-title":"The Era of 1-bit LLMs: All Large Language Models are in 1.58 Bits. arXiv preprint arXiv:2402.17764","author":"Ma Shuming","year":"2024","unstructured":"Shuming Ma, Hongyu Wang, Lingxiao Ma, Lei Wang, Wenhui Wang, Shaohan Huang, Lifeng Dong, Ruiping Wang, Jilong Xue, and Furu Wei. 2024. The Era of 1-bit LLMs: All Large Language Models are in 1.58 Bits. arXiv preprint arXiv:2402.17764 (2024)."},{"key":"e_1_3_2_1_30_1","volume-title":"The Official Microsoft Blog. https:\/\/blogs.microsoft.com\/blog\/2024\/05\/20\/introducing-copilot-pcs\/ Accessed","author":"Mehdi Yusuf","year":"2025","unstructured":"Yusuf Mehdi. 2024. Introducing Copilot+ PCs. The Official Microsoft Blog. https:\/\/blogs.microsoft.com\/blog\/2024\/05\/20\/introducing-copilot-pcs\/ Accessed May 8, 2025."},{"key":"e_1_3_2_1_31_1","unstructured":"Mohamed Mekkouri Marc Sun Leandro von Werra Pedro Cuenca Omar Sanseviero and Thomas Wolf. 2024. Fine-tuning LLMs to 1.58bit: Extreme Quantization Made Easy. https:\/\/huggingface.co\/blog\/1_58_llm_extreme_quantization."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731057"},{"key":"e_1_3_2_1_33_1","unstructured":"NVIDIA Corporation. 2020. 
NVIDIA A100 Tensor Core GPU Architecture. Technical Report. NVIDIA Corporation. https:\/\/images.nvidia.cn\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf Accessed May 8 2025."},{"key":"e_1_3_2_1_34_1","volume-title":"T-SAR: A Full-Stack Co-design for CPU-Only Ternary LLM Inference via In-Place SIMD ALU Reorganization. arXiv preprint arXiv:2511.13676","author":"Oh Hyunwoo","year":"2025","unstructured":"Hyunwoo Oh, KyungIn Nam, Rajat Bhattacharjya, Hanning Chen, Tamoghno Das, Sanggeon Yun, Suyeon Jang, Andrew Ding, Nikil Dutt, and Mohsen Imani. 2025. T-SAR: A Full-Stack Co-design for CPU-Only Ternary LLM Inference via In-Place SIMD ALU Reorganization. arXiv preprint arXiv:2511.13676 (2025)."},{"key":"e_1_3_2_1_35_1","volume-title":"International Conference on Learning Representations.","author":"Park Gunho","year":"2024","unstructured":"Gunho Park, Baeseong Park, Minsub Kim, Sungjae Lee, Jeonghoon Kim, Beomseok Kwon, Se Jung Kwon, Byeongwook Kim, Youngjoo Lee, and Dongsoo Lee. 2024. LUT-GEMM: Quantized Matrix Multiplication based on LUTs for Efficient Inference in Large-Scale Generative Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_36_1","unstructured":"Qualcomm Technologies Inc. 2024. Unlocking on-device generative AI with an NPU and heterogeneous computing. Technical Report. Qualcomm Technologies Inc. https:\/\/www.qualcomm.com\/content\/dam\/qcomm-martech\/dm-assets\/documents\/Unlocking-on-device-generative-AI-with-an-NPU-and-heterogeneous-computing.pdf Accessed May 8 2025."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680207.3723486"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711875.3729141"},{"key":"e_1_3_2_1_39_1","volume-title":"BitNet: Scaling 1-bit Transformers for Large Language Models. 
arXiv preprint arXiv:2310.11453","author":"Wang Hongyu","year":"2023","unstructured":"Hongyu Wang, Shuming Ma, Li Dong, Shaohan Huang, Huaijie Wang, Lingxiao Ma, Fan Yang, Ruiping Wang, Yi Wu, and Furu Wei. 2023. BitNet: Scaling 1-bit Transformers for Large Language Models. arXiv preprint arXiv:2310.11453 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.457"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696099"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649379"},{"key":"e_1_3_2_1_43_1","volume-title":"International Conference on Machine Learning. PMLR, 38087\u201338099","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning. PMLR, 38087\u201338099."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649361"}],"event":{"name":"MobiSys '26: 24th Annual International Conference on Mobile Systems, Applications and Services","location":"University of Cambridge Cambridge United Kingdom","acronym":"MobiSys '26","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 24th Annual International Conference on Mobile Systems, Applications and 
Services"],"original-title":[],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:53:10Z","timestamp":1780059190000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3745756.3809200"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,20]]},"references-count":44,"alternative-id":["10.1145\/3745756.3809200","10.1145\/3745756"],"URL":"https:\/\/doi.org\/10.1145\/3745756.3809200","relation":{},"subject":[],"published":{"date-parts":[[2026,6,20]]},"assertion":[{"value":"2026-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}