{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T14:53:57Z","timestamp":1781794437436,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":27,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,22]],"date-time":"2026-06-22T00:00:00Z","timestamp":1782086400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"the Jiangsu Province Major Scientific Project","award":["BG2024032"],"award-info":[{"award-number":["BG2024032"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,22]]},"DOI":"10.1145\/3787109.3815208","type":"proceedings-article","created":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T14:17:19Z","timestamp":1781792239000},"page":"109-114","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["HoloLUT: An Efficient LUT-Based Engine via Holistic Data Processing for Low-bit LLM Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-8690-3608","authenticated-orcid":false,"given":"Hui","family":"Wang","sequence":"first","affiliation":[{"name":"Nanjing University, Nanjing, Jiangsu, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2365-5284","authenticated-orcid":false,"given":"Weize","family":"Ma","sequence":"additional","affiliation":[{"name":"Nanjing University, Nanjing, Jiangsu, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0192-3703","authenticated-orcid":false,"given":"Jinming","family":"Lu","sequence":"additional","affiliation":[{"name":"Nanjing University, Suzhou, Jiangsu, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3505-4847","authenticated-orcid":false,"given":"Jun","family":"Lin","sequence":"additional","affiliation":[{"name":"Nanjing University, Nanjing, Jiangsu, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,22]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Rajeev Balasubramonian Andrew\u00a0B. Kahng Naveen Muralimanohar Ali Shafiee and Vaishnav Srinivas. 2017. CACTI 7: New Tools for Interconnect Exploration in Innovative Off-Chip Memories. ACM Trans. Archit. Code Optim. 14 2 (June 2017).","DOI":"10.1145\/3085572"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.498"},{"key":"e_1_3_3_1_4_2","first-page":"30318","volume-title":"Advances in Neural Information Processing Systems","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. GPT3.int8(): 8-bit matrix multiplication for transformers at scale. In Advances in Neural Information Processing Systems , Vol.\u00a035. 30318\u201330332."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00110"},{"key":"e_1_3_3_1_6_2","volume-title":"GPTQ: Accurate post-training quantization for generative pre-trained transformers","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. GPTQ: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.17323."},{"key":"e_1_3_3_1_7_2","volume-title":"The Eleventh International Conference on Learning Representations (ICLR\u201923)","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2023. OPTQ: Accurate Quantization for Generative Pre-trained Transformers. In The Eleventh International Conference on Learning Representations (ICLR\u201923)."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00095"},{"key":"e_1_3_3_1_9_2","series-title":"(ICML\u201924)","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Huang Wei","year":"2024","unstructured":"Wei Huang, Yangdong Liu, Haotong Qin, Ying Li, Shiming Zhang, Xianglong Liu, Michele Magno, and Xiaojuan Qi. 2024. BiLLM: pushing the limit of post-training quantization for LLMs. In Proceedings of the 41st International Conference on Machine Learning(ICML\u201924). Article 806, 20\u00a0pages."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00064"},{"key":"e_1_3_3_1_11_2","volume-title":"Scaling laws for neural language models","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom\u00a0B. Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. 2020. Scaling laws for neural language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2001.08361."},{"key":"e_1_3_3_1_12_2","volume-title":"SqueezeLLM: Dense-and-Sparse quantization","author":"Kim Sehoon","year":"2023","unstructured":"Sehoon Kim, Coleman Hooper, Amir Gholami, Zhen Dong, Xiuyu Li, Sheng Shen, Michael\u00a0W. Mahoney, and Kurt Keutzer. 2023. SqueezeLLM: Dense-and-Sparse quantization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.07629"},{"key":"e_1_3_3_1_13_2","volume-title":"The Eleventh International Conference on Learning Representations (ICLR\u201923)","author":"Kim Yulhwa","year":"2023","unstructured":"Yulhwa Kim, Jaeyong Jang, Jehun Lee, Jihoon Park, Jeonghoon Kim, Byeongwook Kim, Baeseong Park, Se\u00a0Jung Kwon, Dongsoo Lee, and Jae-Joon Kim. 2023. Winning Both the Accuracy of Floating Point Activation and the Simplicity of Integer Arithmetic. In The Eleventh International Conference on Learning Representations (ICLR\u201923)."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Joo-Young Lee Changhyeon Kim Sanghoon Kang Dongjoo Shin Seungyeob Kim and Hoi-Jun Yoo. 2019. UNPU: An Energy-Efficient Deep Neural Network Accelerator With Fully Variable Weight Bit Precision. IEEE Journal of Solid-State Circuits 54 1 (2019) 173\u2013185.","DOI":"10.1109\/JSSC.2018.2865489"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Shang Li Zhiyuan Yang Dhiraj Reddy Ankur Srivastava and Bruce Jacob. 2020. DRAMsim3: A Cycle-Accurate Thermal-Capable DRAM Simulator. IEEE Computer Architecture Letters 19 2 (2020) 106\u2013109.","DOI":"10.1109\/LCA.2020.2973991"},{"key":"e_1_3_3_1_16_2","volume-title":"The Thirteenth International Conference on Learning Representations (ICLR\u201925)","author":"Li Zhiteng","year":"2025","unstructured":"Zhiteng Li, Xianglong Yan, Tianao Zhang, Haotong Qin, Dong Xie, Jiang Tian, Zhongchao Shi, Linghe Kong, Yulun Zhang, and Xiaokang Yang. 2025. ARB-LLM: Alternating Refined Binarizations for Large Language Models. In The Thirteenth International Conference on Learning Representations (ICLR\u201925)."},{"key":"e_1_3_3_1_17_2","first-page":"87","volume-title":"Proceedings of Machine Learning and Systems","volume":"6","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Guangxuan Xiao, and Song Han. 2024. AWQ: Activation-aware weight quantization for on-device LLM compression and acceleration. In Proceedings of Machine Learning and Systems , Vol.\u00a06. 87\u2013100."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.26"},{"key":"e_1_3_3_1_19_2","first-page":"92009","volume-title":"The Thirteenth International Conference on Learning Representations (ICLR\u201925)","volume":"2025","author":"Liu Zechun","year":"2025","unstructured":"Zechun Liu, Changsheng Zhao, Igor Fedorov, Bilge Soran, Dhruv Choudhary, Raghuraman Krishnamoorthi, Vikas Chandra, Yuandong Tian, and Tijmen Blankevoort. 2025. SpinQuant: LLM Quantization with Learned Rotations. In The Thirteenth International Conference on Learning Representations (ICLR\u201925) , Vol.\u00a02025. 92009\u201392032."},{"key":"e_1_3_3_1_20_2","volume-title":"ParetoQ: Scaling Laws in Extremely Low-bit LLM Quantization","author":"Liu Zechun","year":"2025","unstructured":"Zechun Liu, Changsheng Zhao, Hanxian Huang, Sijia Chen, Jing Zhang, Jiawei Zhao, Scott Roy, Lisa Jin, Yunyang Xiong, Yangyang Shi, et\u00a0al. 2025. ParetoQ: Scaling Laws in Extremely Low-bit LLM Quantization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.02631"},{"key":"e_1_3_3_1_21_2","volume-title":"The Era of 1-bit LLMs: All Large Language Models are in 1.58 Bits","author":"Ma Shuming","year":"2024","unstructured":"Shuming Ma, Hongyu Wang, Lingxiao Ma, Lei Wang, Wenhui Wang, Shaohan Huang, Li Dong, Ruiping Wang, Jilong Xue, and Furu Wei. 2024. The Era of 1-bit LLMs: All Large Language Models are in 1.58 Bits. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.17764"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731057"},{"key":"e_1_3_3_1_23_2","unstructured":"NVIDIA. 2025. CUTLASS: CUDA Templates for Linear Algebra Subroutines. https:\/\/github.com\/NVIDIA\/cutlass."},{"key":"e_1_3_3_1_24_2","volume-title":"The Twelfth International Conference on Learning Representations (ICLR\u201924)","author":"Park Gunho","year":"2024","unstructured":"Gunho Park, Baeseong Park, Minsub Kim, Sungjae Lee, Jeonghoon Kim, Beomseok Kwon, Se\u00a0Jung Kwon, Byeongwook Kim, Youngjoo Lee, and Dongsoo Lee. 2024. LUT-GEMM: Quantized Matrix Multiplication based on LUTs for Efficient Inference in Large-Scale Generative Language Models. In The Twelfth International Conference on Learning Representations (ICLR\u201924)."},{"key":"e_1_3_3_1_25_2","volume-title":"LLaMA: Open and Efficient Foundation Language Models","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971."},{"key":"e_1_3_3_1_26_2","volume-title":"BitNet: Scaling 1-bit Transformers for Large Language Models","author":"Wang Hongyu","year":"2023","unstructured":"Hongyu Wang, Shuming Ma, Li Dong, Shaohan Huang, Huaijie Wang, Lingxiao Ma, Fan Yang, Ruiping Wang, Yi Wu, and Furu Wei. 2023. BitNet: Scaling 1-bit Transformers for Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.11453."},{"key":"e_1_3_3_1_27_2","volume-title":"T-MAC: CPU Renaissance via Table Lookup for Low-Bit LLM Deployment on Edge","author":"Wei Jianyu","year":"2024","unstructured":"Jianyu Wei, Shijie Cao, Ting Cao, Lingxiao Ma, Lei Wang, Yanyong Zhang, and Mao Yang. 2024. T-MAC: CPU Renaissance via Table Lookup for Low-Bit LLM Deployment on Edge. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.00088."},{"key":"e_1_3_3_1_28_2","volume-title":"OPT: Open Pre-trained Transformer Language Models","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi\u00a0Victoria Lin, Todor Mihaylov, Myle Ott, Shikhar Shleifer, Kurt Shuster, Daniel Simig, Punit\u00a0Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2205.01068."}],"event":{"name":"GLSVLSI '26: Great Lakes Symposium on VLSI 2026","location":"Canandaigua , NY , USA","acronym":"GLSVLSI '26","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE CEDA"]},"container-title":["Proceedings of the Great Lakes Symposium on VLSI 2026"],"original-title":[],"deposited":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T14:19:03Z","timestamp":1781792343000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3787109.3815208"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,22]]},"references-count":27,"alternative-id":["10.1145\/3787109.3815208","10.1145\/3787109"],"URL":"https:\/\/doi.org\/10.1145\/3787109.3815208","relation":{},"subject":[],"published":{"date-parts":[[2026,6,22]]},"assertion":[{"value":"2026-06-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}