{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T16:37:06Z","timestamp":1773247026727,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":113,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T00:00:00Z","timestamp":1760659200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2316177"],"award-info":[{"award-number":["2316177"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756037","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:19:56Z","timestamp":1760721596000},"page":"1592-1608","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["MCBP: A Memory-Compute Efficient LLM Inference Accelerator Leveraging Bit-Slice-enabled Sparsity and Repetitiveness"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9763-8208","authenticated-orcid":false,"given":"Huizheng","family":"Wang","sequence":"first","affiliation":[{"name":"School of Integrated Circuits, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7114-0647","authenticated-orcid":false,"given":"Zichuan","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4084-3478","authenticated-orcid":false,"given":"Zhiheng","family":"Yue","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0326-2029","authenticated-orcid":false,"given":"Yousheng","family":"Long","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3501-3148","authenticated-orcid":false,"given":"Taiquan","family":"Wei","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9905-0961","authenticated-orcid":false,"given":"Jianxun","family":"Yang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8293-8881","authenticated-orcid":false,"given":"Yang","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6218-4659","authenticated-orcid":false,"given":"Chao","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5117-7920","authenticated-orcid":false,"given":"Shaojun","family":"Wei","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6942-4395","authenticated-orcid":false,"given":"Yang","family":"Hu","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8438-8588","authenticated-orcid":false,"given":"Shouyi","family":"Yin","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Tsinghua University, Beijing, China and Shanghai Artificial Intelligence Laboratory, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Mikl\u00f3s Ajtai. 1994. The complexity of the pigeonhole principle. Combinatorica 14 (1994) 417\u2013433.","DOI":"10.1007\/BF01302964"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123982"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISVLSI.2016.111"},{"key":"e_1_3_3_1_5_2","unstructured":"Rohan Anil Andrew\u00a0M Dai Orhan Firat Melvin Johnson Dmitry Lepikhin Alexandre Passos Siamak Shakeri Emanuel Taropa Paige Bailey Zhifeng Chen et\u00a0al. 2023. Palm 2 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.10403 (2023)."},{"key":"e_1_3_3_1_6_2","unstructured":"Jacob Austin Augustus Odena Maxwell Nye Maarten Bosma Henryk Michalewski David Dohan Ellen Jiang Carrie Cai Michael Terry Quoc Le et\u00a0al. 2021. Program synthesis with large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2108.07732 (2021)."},{"key":"e_1_3_3_1_7_2","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang Binyuan Hui Luo Ji Mei Li Junyang Lin Runji Lin Dayiheng Liu Gao Liu Chengqiang Lu Keming Lu Jianxin Ma Rui Men Xingzhang Ren Xuancheng Ren Chuanqi Tan Sinan Tan Jianhong Tu Peng Wang Shijie Wang Wei Wang Shengguang Wu Benfeng Xu Jin Xu An Yang Hao Yang Jian Yang Shusheng Yang Yang Yao Bowen Yu Hongyi Yuan Zheng Yuan Jianwei Zhang Xingxuan Zhang Yichang Zhang Zhenru Zhang Chang Zhou Jingren Zhou Xiaohuan Zhou and Tianhang Zhu. 2023. Qwen technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.16609 (2023)."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3649329.3658488"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Lukas Cavigelli and Luca Benini. 2016. Origami: A 803-GOp\/s\/W convolutional network accelerator. IEEE Transactions on Circuits and Systems for Video Technology 27 11 (2016) 2461\u20132475.","DOI":"10.1109\/TCSVT.2016.2592330"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Yupeng Chang Xu Wang Jindong Wang Yuan Wu Linyi Yang Kaijie Zhu Hao Chen Xiaoyuan Yi Cunxiang Wang Yidong Wang Wei Ye Yue Zhang Yi Chang Philip\u00a0S. Yu Qiang Yang and Xing Xie. 2024. A survey on evaluation of large language models. ACM Transactions on Intelligent Systems and Technology 15 3 (2024) 1\u201345.","DOI":"10.1145\/3641289"},{"key":"e_1_3_3_1_11_2","unstructured":"Mark Chen Jerry Tworek Heewoo Jun Qiming Yuan Henrique Ponde De\u00a0Oliveira Pinto Jared Kaplan Harri Edwards Yuri Burda Nicholas Joseph Greg Brockman Alex Ray Raul Puri Gretchen Krueger Michael Petrov Heidy Khlaaf Girish Sastry Pamela Mishkin Brooke Chan Scott Gray Nick Ryder Mikhail Pavlov Alethea Power Lukasz Kaiser Mohammad Bavarian Clemens Winter Philippe Tillet Felipe Petroski\u00a0Such Dave Cummings Matthias Plappert Fotios Chantzis Elizabeth Barnes Ariel Herbert-Voss William Hebgen\u00a0Guss Alex Nichol Alex Paino Nikolas Tezak Jie Tang Igor Babuschkin Suchir Balaji Shantanu Jain William Saunders Christopher Hesse Andrew N.\u00a0Carr Jan Leike Josh Achiam Vedant Misra Evan Morikawa Alec Radford Matthew Knight Miles Brundage Mira Murati Katie Mayer Peter Welinder Bob McGrew Dario Amodei Sam McCandlish Ilya Sutskever and Wojciech Zaremba. 2021. Evaluating large language models trained on code. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2107.03374 (2021)."},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00047"},{"key":"e_1_3_3_1_13_2","unstructured":"Wei-Lin Chiang Zhuohan Li Zi Lin Ying Sheng Zhanghao Wu Hao Zhang Lianmin Zheng Siyuan Zhuang Yonghao Zhuang Joseph\u00a0E Gonzalez et\u00a0al. 2023. Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality. See https:\/\/vicuna. lmsys. org (accessed 14 April 2023) 2 3 (2023) 6."},{"key":"e_1_3_3_1_14_2","unstructured":"Mike Conover Matt Hayes Ankit Mathur Jianwei Xie Jun Wan Sam Shah Ali Ghodsi Patrick Wendell Matei Zaharia and Reynold Xin. 2023. Free Dolly: Introducing the world\u2019s first truly open instruction-tuned LLM. Company Blog of Databricks (2023)."},{"key":"e_1_3_3_1_15_2","unstructured":"Alberto Delmas Patrick Judd Sayeh Sharify and Andreas Moshovos. 2017. Dynamic stripes: Exploiting the dynamic precision requirements of activation values in neural networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1706.00504 (2017)."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304041"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00090"},{"key":"e_1_3_3_1_18_2","unstructured":"Tim Dettmers Mike Lewis Younes Belkada and Luke Zettlemoyer. 2022. GPT3.Int8 (): 8-bit matrix multiplication for Transformers at scale. Advances in Neural Information Processing Systems 35 (2022) 30318\u201330332."},{"key":"e_1_3_3_1_19_2","volume-title":"Findings of EMNLP, 2020","author":"Esin\u00a0Durmus Claire\u00a0Cardie Faisal\u00a0Ladhak,","year":"2020","unstructured":"Claire\u00a0Cardie Faisal\u00a0Ladhak, Esin\u00a0Durmus and Kathleen McKeown. 2020. WikiLingua: A new benchmark dataset for multilingual abstractive summarization. In Findings of EMNLP, 2020."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00050"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589040"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS48785.2022.9937659"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Chao Fang Aojun Zhou and Zhongfeng Wang. 2022. An algorithm-hardware co-optimized framework for accelerating N:M sparse Transformers. IEEE Transactions on Very Large Scale Integration (VLSI) Systems 30 11 (2022) 1573\u20131586.","DOI":"10.1109\/TVLSI.2022.3197282"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358291"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00069"},{"key":"e_1_3_3_1_26_2","first-page":"328","volume-title":"Proceedings of the IEEE International Symposium on High Performance Computer Architecture (HPCA)","author":"Ham Tae\u00a0Jun","year":"2020","unstructured":"Tae\u00a0Jun Ham, Sung\u00a0Jun Jung, Seonghak Kim, Young\u00a0H Oh, Yeonhong Park, Yoonho Song, Jung-Hun Park, Sanghee Lee, Kyoung Park, Jae\u00a0W Lee, and Deog-Kyoon Jeong. 2020. A3: Accelerating attention mechanisms in neural networks with approximation. In Proceedings of the IEEE International Symposium on High Performance Computer Architecture (HPCA). 328\u2013341."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00060"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00095"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Song Han Xingyu Liu Huizi Mao Jing Pu Ardavan Pedram Mark\u00a0A Horowitz and William\u00a0J Dally. 2016. EIE: Efficient inference engine on compressed deep neural network. ACM SIGARCH Computer Architecture News 44 3 (2016) 243\u2013254.","DOI":"10.1145\/3007787.3001163"},{"key":"e_1_3_3_1_30_2","unstructured":"Song Han Huizi Mao and William\u00a0J Dally. 2015. Deep compression: Compressing deep neural networks with pruning trained quantization and huffman coding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1510.00149 (2015)."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527419"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00062"},{"key":"e_1_3_3_1_33_2","unstructured":"Dan Hendrycks Collin Burns Steven Basart Andy Zou Mantas Mazeika Dawn Song and Jacob Steinhardt. 2020. Measuring massive multitask language understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2009.03300 (2020)."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00051"},{"key":"e_1_3_3_1_35_2","unstructured":"Yuxuan Hu Xiaodong Chen Cuiping Li Hong Chen and Jing Zhang. 2025. QUAD: Quantization and Parameter-Efficient Tuning of LLM with Activation Decomposition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.19353 (2025)."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071031"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00286"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783722"},{"key":"e_1_3_3_1_39_2","unstructured":"Dongyun Kam Myeongji Yun Sunwoo Yoo Seungwoo Hong Zhengya Zhang and Youngjoo Lee. 2024. Panacea: Novel DNN Accelerator using Accuracy-Preserving Asymmetric Quantization and Energy-Saving Bit-Slice Sparsity. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.10059 (2024)."},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Sanghoon Kang Donghyeon Han Juhyoung Lee Dongseok Im Sangyeob Kim Soyeon Kim Junha Ryu and Hoi-Jun Yoo. 2021. GANPU: An energy-efficient multi-DNN training processor for GANs with speculative dual-sparsity exploitation. IEEE Journal of Solid-State Circuits 56 9 (2021) 2845\u20132857.","DOI":"10.1109\/JSSC.2021.3066572"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642773"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"Yoongu Kim Weikun Yang and Onur Mutlu. 2015. Ramulator: A fast and extensible DRAM simulator. IEEE Computer architecture letters 15 1 (2015) 45\u201349.","DOI":"10.1109\/LCA.2015.2414456"},{"key":"e_1_3_3_1_43_2","unstructured":"Teven Le\u00a0Scao Angela Fan Christopher Akiki Ellie Pavlick Suzana Ili\u0107 Daniel Hesslow Roman Castagn\u00e9 Alexandra\u00a0Sasha Luccioni Fran\u00e7ois Yvon Matthias Gall\u00e9 et\u00a0al. 2022. Bloom: A 176B-parameter open-access multilingual language model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.05100 (2022)."},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2018.8310262"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Brian Leibowitz Robert Palmer John Poulton Yohan Frans Simon Li John Wilson Michael Bucher Andrew\u00a0M Fuller John Eyles Marko Aleksic Trey Greer and Nhat\u00a0M Nguyen. 2010. A 4.3 GB\/s mobile memory interface with power-efficient bandwidth scaling. IEEE Journal of Solid-State Circuits 45 4 (2010) 889\u2013898.","DOI":"10.1109\/JSSC.2010.2040230"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527404"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3370748.3406567"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00097"},{"key":"e_1_3_3_1_49_2","unstructured":"Guoyu Li Shengyu Ye Chunyun Chen Yang Wang Fan Yang Ting Cao Cheng Liu Mohamed\u00a0M Sabry and Mao Yang. 2025. LUT-DLA: Lookup Table as Efficient Extreme Low-Bit Deep Learning Accelerator. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.10658 (2025)."},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480043"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527423"},{"key":"e_1_3_3_1_52_2","unstructured":"Bin Lin Chen Zhang Tao Peng Hanyu Zhao Wencong Xiao Minmin Sun Anmin Liu Zhipeng Zhang Lanbo Li Xiafei Qiu Li Shen Zhigang Ji Tao Xie Yong Li and Wei Lin. 2024. Infinite-LLM: Efficient LLM service for long context with distattention and distributed kvcache. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.02669 (2024)."},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00082"},{"key":"e_1_3_3_1_54_2","unstructured":"Jing Liu Ruihao Gong Xiuying Wei Zhiwei Dong Jianfei Cai and Bohan Zhuang. 2023. QLLM: Accurate and efficient low-bitwidth quantization for large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.08041 (2023)."},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1145\/3649476.3658709"},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42615.2023.10067360"},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00049"},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247749"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480123"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480125"},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00069"},{"key":"e_1_3_3_1_62_2","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Merity Stephen","year":"2016","unstructured":"Stephen Merity, Caiming Xiong, James Bradbury, and Richard Socher. 2016. Pointer sentinel mixture models. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_3_1_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2017.7870353"},{"key":"e_1_3_3_1_64_2","doi-asserted-by":"crossref","unstructured":"Bert Moons and Marian Verhelst. 2016. An energy-efficient precision-scalable ConvNet processor in 40-nm CMOS. IEEE Journal of solid-state Circuits 52 4 (2016) 903\u2013914.","DOI":"10.1109\/JSSC.2016.2636225"},{"key":"e_1_3_3_1_65_2","doi-asserted-by":"crossref","unstructured":"Naveen Muralimanohar Rajeev Balasubramonian and Norman\u00a0P Jouppi. 2009. CACTI 6.0: A tool to model large caches. HP laboratories 27 (2009) 28.","DOI":"10.1109\/MM.2008.2"},{"key":"e_1_3_3_1_66_2","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3639187"},{"key":"e_1_3_3_1_67_2","unstructured":"Nvidia. 2023. TensorRT-LLM. https:\/\/github.com\/NVIDIA\/TensorRT-LLM?tab=readme-ov-file."},{"key":"e_1_3_3_1_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3124545"},{"key":"e_1_3_3_1_69_2","doi-asserted-by":"crossref","unstructured":"Angshuman Parashar Minsoo Rhu Anurag Mukkara Antonio Puglielli Rangharajan Venkatesan Brucek Khailany Joel Emer Stephen\u00a0W Keckler and William\u00a0J Dally. 2017. SCNN: An accelerator for compressed-sparse convolutional neural networks. ACM SIGARCH computer architecture news 45 2 (2017) 27\u201340.","DOI":"10.1145\/3140659.3080254"},{"key":"e_1_3_3_1_70_2","unstructured":"Adam Paszke Sam Gross Soumith Chintala Gregory Chanan Edward Yang Zachary DeVito Zeming Lin Alban Desmaison Luca Antiga and Adam Lerer. 2017. Automatic differentiation in PyTorch. (2017)."},{"key":"e_1_3_3_1_71_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_3_1_72_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"e_1_3_3_1_73_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589057"},{"key":"e_1_3_3_1_74_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC49661.2025.10904774"},{"key":"e_1_3_3_1_75_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00079"},{"key":"e_1_3_3_1_76_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507738"},{"key":"e_1_3_3_1_77_2","unstructured":"Baptiste Roziere Jonas Gehring Fabian Gloeckle Sten Sootla Itai Gat Xiaoqing\u00a0Ellen Tan Yossi Adi Jingyu Liu Romain Sauvestre Tal Remez et\u00a0al. 2023. Code LLaMa: Open foundation models for code. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.12950 (2023)."},{"key":"e_1_3_3_1_78_2","doi-asserted-by":"crossref","unstructured":"Keisuke Sakaguchi Ronan\u00a0Le Bras Chandra Bhagavatula and Yejin Choi. 2021. Winogrande: An adversarial Winograd schema challenge at scale. Commun. ACM 64 9 (2021) 99\u2013106.","DOI":"10.1145\/3474381"},{"key":"e_1_3_3_1_79_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322255"},{"key":"e_1_3_3_1_80_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00069"},{"key":"e_1_3_3_1_81_2","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530504"},{"key":"e_1_3_3_1_82_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00062"},{"key":"e_1_3_3_1_83_2","first-page":"861","volume-title":"Proceedings of the IEEE International Symposium on High-Performance Computer Architecture (HPCA)","author":"Shin Jong\u00a0Hoon","year":"2022","unstructured":"Jong\u00a0Hoon Shin, Ali Shafiee, Ardavan Pedram, Hamzah Abdel-Aziz, Ling Li, and Joseph Hassoun. 2022. Griffin: Rethinking sparse optimization for deep learning architectures. In Proceedings of the IEEE International Symposium on High-Performance Computer Architecture (HPCA). 861\u2013875."},{"key":"e_1_3_3_1_84_2","volume-title":"North American SystemC Users\u2019 Group, Design Automation Conference","author":"Snyder Wilson","year":"2004","unstructured":"Wilson Snyder. 2004. Verilator and SystemPerl. In North American SystemC Users\u2019 Group, Design Automation Conference."},{"key":"e_1_3_3_1_85_2","unstructured":"Benjamin Spector and Chris Re. 2023. Accelerating LLM inference with staged speculative decoding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.04623 (2023)."},{"key":"e_1_3_3_1_86_2","unstructured":"Salmonn Talebi Elizabeth Tong and Mohammad\u00a0RK Mofrad. 2023. Beyond the Hype: Assessing the Performance Trustworthiness and Clinical Suitability of GPT3. 5. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.15887 (2023)."},{"key":"e_1_3_3_1_87_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480095"},{"key":"e_1_3_3_1_88_2","unstructured":"Rohan Taori Ishaan Gulrajani Tianyi Zhang Yann Dubois Xuechen Li Carlos Guestrin Percy Liang and Tatsunori\u00a0B Hashimoto. 2023. Stanford alpaca: An instruction-following llama model. https:\/\/crfm.stanford.edu\/2023\/03\/13\/alpaca.html."},{"key":"e_1_3_3_1_89_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et\u00a0al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.09288 (2023)."},{"key":"e_1_3_3_1_90_2","doi-asserted-by":"crossref","unstructured":"Shikhar Tuli and Niraj\u00a0K Jha. 2023. AccelTran: A sparsity-aware accelerator for dynamic inference with Transformers. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 42 11 (2023) 4038\u20134051.","DOI":"10.1109\/TCAD.2023.3273992"},{"key":"e_1_3_3_1_91_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_92_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5446"},{"key":"e_1_3_3_1_93_2","doi-asserted-by":"crossref","unstructured":"Huizheng Wang Jiahao Fang Xinru Tang Zhiheng Yue Jinxi Li Yubin Qin Sihan Guan Qize Yang Yang Wang Chao Li Yang Hu and Shouyi Yin. 2024. SOFA: A compute-memory optimized sparsity accelerator via cross-stage coordinated tiling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.10416 (2024).","DOI":"10.1109\/MICRO61859.2024.00093"},{"key":"e_1_3_3_1_94_2","doi-asserted-by":"crossref","unstructured":"Huizheng Wang Weihong Xu Zaichen Zhang Xiaohu You and Chuan Zhang. 2021. An efficient stochastic convolution architecture based on fast FIR algorithm. IEEE Transactions on Circuits and Systems II: Express Briefs 69 3 (2021) 984\u2013988.","DOI":"10.1109\/TCSII.2021.3121081"},{"key":"e_1_3_3_1_95_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"e_1_3_3_1_96_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDSP.2018.8631556"},{"key":"e_1_3_3_1_97_2","doi-asserted-by":"crossref","unstructured":"Yizhi Wang Jun Lin and Zhongfeng Wang. 2017. An energy-efficient architecture for binary weight convolutional neural networks. IEEE Transactions on Very Large Scale Integration (VLSI) Systems 26 2 (2017) 280\u2013293.","DOI":"10.1109\/TVLSI.2017.2767624"},{"key":"e_1_3_3_1_98_2","doi-asserted-by":"crossref","unstructured":"Yang Wang Yubin Qin Dazheng Deng Jingchuan Wei Yang Zhou Yuanqi Fan Tianbao Chen Hao Sun Leibo Liu Shaojun Wei and Shouyi Yin. 2022. An energy-efficient Transformer processor exploiting dynamic weak relevances in global attention. IEEE Journal of Solid-State Circuits 58 1 (2022) 227\u2013242.","DOI":"10.1109\/JSSC.2022.3213521"},{"key":"e_1_3_3_1_99_2","unstructured":"Xiuying Wei Yunchen Zhang Xiangguo Zhang Ruihao Gong Shanghang Zhang Qi Zhang Fengwei Yu and Xianglong Liu. 2022. Outlier suppression: Pushing the limit of low-bit transformer language models. Advances in Neural Information Processing Systems 35 (2022) 17402\u201317414."},{"key":"e_1_3_3_1_100_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_3_1_101_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623786"},{"key":"e_1_3_3_1_102_2","first-page":"38087","volume-title":"International Conference on Machine Learning","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning. PMLR, 38087\u201338099."},{"key":"e_1_3_3_1_103_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00064"},{"key":"e_1_3_3_1_104_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00079"},{"key":"e_1_3_3_1_105_2","doi-asserted-by":"crossref","unstructured":"Tao Yang Fei Ma Xiaoling Li Fangxin Liu Yilong Zhao Zhezhi He and Li Jiang. 2022. DTATrans: Leveraging dynamic token-based quantization with accuracy compensation mechanism for efficient Transformer architecture. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 42 2 (2022) 509\u2013520.","DOI":"10.1109\/TCAD.2022.3181541"},{"key":"e_1_3_3_1_106_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071080"},{"key":"e_1_3_3_1_107_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00059"},{"key":"e_1_3_3_1_108_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247799"},{"key":"e_1_3_3_1_109_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071027"},{"key":"e_1_3_3_1_110_2","doi-asserted-by":"publisher","DOI":"10.1109\/EMC2-NIPS53020.2019.00016"},{"key":"e_1_3_3_1_111_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783723"},{"key":"e_1_3_3_1_112_2","unstructured":"Susan Zhang Stephen Roller Naman Goyal Mikel Artetxe Moya Chen Shuohui Chen Christopher Dewan Mona Diab Xian Li Xi\u00a0Victoria Lin Todor Mihaylov Myle Ott Sam Shleifer Kurt Shuster Daniel Simig Punit\u00a0Singh Koura Anjali Sridhar Tianlu Wang and Luke Zettlemoyer. 2022. OPT: Open pre-trained transformer language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2205.01068 (2022)."},{"key":"e_1_3_3_1_113_2","unstructured":"Yilong Zhao Chien-Yu Lin Kan Zhu Zihao Ye Lequn Chen Size Zheng Luis Ceze Arvind Krishnamurthy Tianqi Chen and Baris Kasikci. 2024. Atom: Low-bit quantization for efficient and accurate llm serving. Proceedings of Machine Learning and Systems 6 (2024) 196\u2013209."},{"key":"e_1_3_3_1_114_2","doi-asserted-by":"crossref","unstructured":"Zhe Zhou Junlin Liu Zhenyu Gu and Guangyu Sun. 2022. Energon: Toward efficient acceleration of Transformers using dynamic sparse attention. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 42 1 (2022) 136\u2013149.","DOI":"10.1109\/TCAD.2022.3170848"}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756037","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756037","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:46:59Z","timestamp":1769464019000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756037"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":113,"alternative-id":["10.1145\/3725843.3756037","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756037","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}