{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T23:12:43Z","timestamp":1775862763790,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3689031.3696099","type":"proceedings-article","created":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T06:25:20Z","timestamp":1742970320000},"page":"278-292","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":11,"title":["T-MAC: CPU Renaissance via Table Lookup for Low-Bit LLM Deployment on Edge"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-0830-044X","authenticated-orcid":false,"given":"Jianyu","family":"Wei","sequence":"first","affiliation":[{"name":"USTC \/ Microsoft Research"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2001-3763","authenticated-orcid":false,"given":"Shijie","family":"Cao","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9107-013X","authenticated-orcid":false,"given":"Ting","family":"Cao","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9524-5476","authenticated-orcid":false,"given":"Lingxiao","family":"Ma","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2313-5348","authenticated-orcid":false,"given":"Lei","family":"Wang","sequence":"additional","affiliation":[{"name":"UCAS \/ Microsoft Research"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9046-798X","authenticated-orcid":false,"given":"Yanyong","family":"Zhang","sequence":"additional","affiliation":[{"name":"USTC"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6455-3898","authenticated-orcid":false,"given":"Mao","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"DLPack. https:\/\/github.com\/dmlc\/dlpack."},{"key":"e_1_3_2_1_2_1","unstructured":"Intel Neural Compressor. https:\/\/github.com\/intel\/neural-compressor."},{"key":"e_1_3_2_1_3_1","unstructured":"Introducing Copilot+ PCs. https:\/\/blogs.microsoft.com\/blog\/2024\/05\/20\/introducing-copilot-pcs\/#_ftn2."},{"key":"e_1_3_2_1_4_1","unstructured":"Llama-2-7B GGUF Models. https:\/\/huggingface.co\/TheBloke\/Llama-2-7B-GGUF."},{"key":"e_1_3_2_1_5_1","unstructured":"llama.cpp. https:\/\/github.com\/ggerganov\/llama.cpp."},{"key":"e_1_3_2_1_6_1","unstructured":"llama.cpp perplexity. https:\/\/github.com\/ggerganov\/llama.cpp\/blob\/master\/examples\/perplexity\/perplexity.cpp."},{"key":"e_1_3_2_1_7_1","unstructured":"MICROSOFT BitBLAS. https:\/\/github.com\/microsoft\/bitblas."},{"key":"e_1_3_2_1_8_1","unstructured":"NVIDIA CUTLASS. https:\/\/github.com\/NVIDIA\/cutlass."},{"key":"e_1_3_2_1_9_1","unstructured":"NVIDIA TensorRT-LLM. https:\/\/github.com\/NVIDIA\/TensorRT-LLM."},{"key":"e_1_3_2_1_10_1","unstructured":"Qualcomm AI Hub. https:\/\/aihub.qualcomm.com."},{"key":"e_1_3_2_1_11_1","volume-title":"An adversarial winograd schema challenge at scale","author":"Winogrande","year":"2019","unstructured":"Winogrande: An adversarial winograd schema challenge at scale. 2019."},{"key":"e_1_3_2_1_12_1","unstructured":"Marah Abdin Sam Ade Jacobs Ammar Ahmad Awan Jyoti Aneja Ahmed Awadallah Hany Awadalla Nguyen Bach Amit Bahree Arash Bakhtiari Harkirat Behl Alon Benhaim Misha Bilenko Johan Bjorck S\u00e9bastien Bubeck Martin Cai Caio C\u00e9sar Teodoro Mendes Weizhu Chen Vishrav Chaudhary Parul Chopra Allie Del Giorno Gustavo de Rosa Matthew Dixon Ronen Eldan Dan Iter Amit Garg Abhishek Goswami Suriya Gunasekar Emman Haider Junheng Hao Russell J. Hewett Jamie Huynh Mojan Javaheripi Xin Jin Piero Kauffmann Nikos Karampatziakis Dongwoo Kim Mahoud Khademi Lev Kurilenko James R. Lee Yin Tat Lee Yuanzhi Li Chen Liang Weishung Liu Eric Lin Zeqi Lin Piyush Madan Arindam Mitra Hardik Modi Anh Nguyen Brandon Norick Barun Patra Daniel Perez-Becker Thomas Portet Reid Pryzant Heyang Qin Marko Radmilac Corby Rosset Sambudha Roy Olatunji Ruwase Olli Saarikivi Amin Saied Adil Salim Michael Santacroce Shital Shah Ning Shang Hiteshi Sharma Xia Song Masahiro Tanaka Xin Wang Rachel Ward Guanhua Wang Philipp Witte Michael Wyatt Can Xu Jiahang Xu Sonali Yadav Fan Yang Ziyi Yang Donghan Yu Chengruidong Zhang Cyril Zhang Jianwen Zhang Li Lyna Zhang Yi Zhang Yue Zhang Yunan Zhang and Xiren Zhou. Phi-3 technical report: A highly capable language model locally on your phone 2024."},{"key":"e_1_3_2_1_13_1","first-page":"992","volume-title":"International Conference on Machine Learning","author":"Blalock Davis","year":"2021","unstructured":"Davis Blalock and John Guttag. Multiplying matrices without multiplying. In International Conference on Machine Learning, pages 992--1004. PMLR, 2021."},{"key":"e_1_3_2_1_14_1","volume-title":"Quip: 2-bit quantization of large language models with guarantees","author":"Chee Jerry","year":"2024","unstructured":"Jerry Chee, Yaohui Cai, Volodymyr Kuleshov, and Christopher De Sa. Quip: 2-bit quantization of large language models with guarantees, 2024."},{"key":"e_1_3_2_1_15_1","first-page":"579","volume-title":"Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation, OSDI'18","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Meghan Cowan, Haichen Shen, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. Tvm: an automated end-to-end optimizing compiler for deep learning. In Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation, OSDI'18, page 579--594, USA, 2018. USENIX Association."},{"key":"e_1_3_2_1_16_1","volume-title":"Learning to optimize tensor programs. CoRR abs\/1805.08166","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Lianmin Zheng, Eddie Q. Yan, Ziheng Jiang, Thierry Moreau, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. Learning to optimize tensor programs. CoRR abs\/1805.08166, 2018."},{"key":"e_1_3_2_1_17_1","first-page":"30318","article-title":"8-bit matrix multiplication for transformers at scale","volume":"35","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. Gpt3. int8 (): 8-bit matrix multiplication for transformers at scale. Advances in Neural Information Processing Systems, 35:30318--30332, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","first-page":"7750","volume-title":"International Conference on Machine Learning","author":"Dettmers Tim","year":"2023","unstructured":"Tim Dettmers and Luke Zettlemoyer. The case for 4-bit precision: k-bit inference scaling laws. In International Conference on Machine Learning, pages 7750--7774. PMLR, 2023."},{"key":"e_1_3_2_1_19_1","volume-title":"Bitdistiller: Unleashing the potential of sub-4-bit llms via self-distillation","author":"Du Dayou","year":"2024","unstructured":"Dayou Du, Yijia Zhang, Shijie Cao, Jiaqi Guo, Ting Cao, Xiaowen Chu, and Ningyi Xu. Bitdistiller: Unleashing the potential of sub-4-bit llms via self-distillation, 2024."},{"key":"e_1_3_2_1_20_1","volume-title":"Marlin: a fast 4-bit inference kernel for medium batchsizes. https:\/\/github.com\/IST-DASLab\/marlin","author":"Frantar Elias","year":"2024","unstructured":"Elias Frantar and Dan Alistarh. Marlin: a fast 4-bit inference kernel for medium batchsizes. https:\/\/github.com\/IST-DASLab\/marlin, 2024."},{"key":"e_1_3_2_1_21_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323, 2022."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00491"},{"key":"e_1_3_2_1_23_1","volume-title":"High throughput matrix-matrix multiplication between asymmetric bit-width operands","author":"Gope Dibakar","year":"2020","unstructured":"Dibakar Gope, Jesse Beu, and Matthew Mattina. High throughput matrix-matrix multiplication between asymmetric bit-width operands, 2020."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2004.1281665"},{"key":"e_1_3_2_1_26_1","volume-title":"Awq: Activation-aware weight quantization for llm compression and acceleration. arXiv preprint arXiv:2306.00978","author":"Lin Ji","year":"2023","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Xingyu Dang, and Song Han. Awq: Activation-aware weight quantization for llm compression and acceleration. arXiv preprint arXiv:2306.00978, 2023."},{"key":"e_1_3_2_1_27_1","volume-title":"Look-up mai gemm: Increasing ai gemms performance by nearly 2.5 x via msgemm. arXiv preprint arXiv:2310.06178","author":"Maleki Saeed","year":"2023","unstructured":"Saeed Maleki. Look-up mai gemm: Increasing ai gemms performance by nearly 2.5 x via msgemm. arXiv preprint arXiv:2310.06178, 2023."},{"key":"e_1_3_2_1_28_1","volume-title":"Pointer sentinel mixture models","author":"Merity Stephen","year":"2016","unstructured":"Stephen Merity, Caiming Xiong, James Bradbury, and Richard Socher. Pointer sentinel mixture models, 2016."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1144"},{"key":"e_1_3_2_1_30_1","volume-title":"Byeongwook Kim, Youngjoo Lee, and Dongsoo Lee. Lut-gemm: Quantized matrix multiplication based on luts for efficient inference in large-scale generative language models","author":"Park Gunho","year":"2023","unstructured":"Gunho Park, Baeseong Park, Minsub Kim, Sungjae Lee, Jeonghoon Kim, Beomseok Kwon, Se Jung Kwon, Byeongwook Kim, Youngjoo Lee, and Dongsoo Lee. Lut-gemm: Quantized matrix multiplication based on luts for efficient inference in large-scale generative language models, 2023."},{"key":"e_1_3_2_1_31_1","volume-title":"Language models are unsupervised multitask learners","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeff Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. Language models are unsupervised multitask learners. 2019."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3570361.3613285"},{"key":"e_1_3_2_1_33_1","volume-title":"Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805","author":"Team Gemini","year":"2023","unstructured":"Gemini Team, Rohan Anil, Sebastian Borgeaud, Yonghui Wu, Jean-Baptiste Alayrac, Jiahui Yu, Radu Soricut, Johan Schalkwyk, Andrew M Dai, Anja Hauth, et al. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805, 2023."},{"key":"e_1_3_2_1_34_1","volume-title":"Bitnet: Scaling 1-bit transformers for large language models. arXiv preprint arXiv:2310.11453","author":"Wang Hongyu","year":"2023","unstructured":"Hongyu Wang, Shuming Ma, Li Dong, Shaohan Huang, Huaijie Wang, Lingxiao Ma, Fan Yang, Ruiping Wang, Yi Wu, and Furu Wei. Bitnet: Scaling 1-bit transformers for large language models. arXiv preprint arXiv:2310.11453, 2023."},{"key":"e_1_3_2_1_35_1","first-page":"38087","volume-title":"International Conference on Machine Learning","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning, pages 38087--38099. PMLR, 2023."},{"key":"e_1_3_2_1_36_1","volume-title":"Onebit: Towards extremely low-bit large language models. arXiv preprint arXiv:2402.11295","author":"Xu Yuzhuang","year":"2024","unstructured":"Yuzhuang Xu, Xu Han, Zonghan Yang, Shuo Wang, Qingfu Zhu, Zhiyuan Liu, Weidong Liu, and Wanxiang Che. Onebit: Towards extremely low-bit large language models. arXiv preprint arXiv:2402.11295, 2024."},{"key":"e_1_3_2_1_37_1","volume-title":"et al. Yi: Open foundation models by 01. ai. arXiv preprint arXiv:2403.04652","author":"Young Alex","year":"2024","unstructured":"Alex Young, Bei Chen, Chao Li, Chengen Huang, Ge Zhang, Guanwei Zhang, Heng Li, Jiangcheng Zhu, Jianqun Chen, Jing Chang, et al. Yi: Open foundation models by 01. ai. arXiv preprint arXiv:2403.04652, 2024."}],"event":{"name":"EuroSys '25: Twentieth European Conference on Computer Systems","location":"Rotterdam Netherlands","acronym":"EuroSys '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Twentieth European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3696099","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689031.3696099","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:23:25Z","timestamp":1755775405000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3696099"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":37,"alternative-id":["10.1145\/3689031.3696099","10.1145\/3689031"],"URL":"https:\/\/doi.org\/10.1145\/3689031.3696099","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}