{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T16:03:44Z","timestamp":1780675424060,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":91,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T00:00:00Z","timestamp":1760659200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100002418","name":"Intel Corporation","doi-asserted-by":"publisher","award":[""],"award-info":[{"award-number":[""]}],"id":[{"id":"10.13039\/100002418","id-type":"DOI","asserted-by":"publisher"}]},{"name":"DARPA","award":["ACE, one of the seven centers in JUMP 2.0, a Semiconductor Research Corporation (SRC) program"],"award-info":[{"award-number":["ACE, one of the seven centers in JUMP 2.0, a Semiconductor Research Corporation (SRC) program"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756073","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:19:56Z","timestamp":1760721596000},"page":"184-200","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["DECA: A Near-Core LLM Decompression Accelerator Grounded on a 3D Roofline Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7946-2683","authenticated-orcid":false,"given":"Gerasimos","family":"Gerogiannis","sequence":"first","affiliation":[{"name":"Intel, University of Illinois at Urbana-Champaign, Urbana, Illinois, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2587-7541","authenticated-orcid":false,"given":"Stijn","family":"Eyerman","sequence":"additional","affiliation":[{"name":"Intel, Brussels, Belgium"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8738-3532","authenticated-orcid":false,"given":"Evangelos","family":"Georganas","sequence":"additional","affiliation":[{"name":"Intel Labs, Santa Clara, California, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2286-1525","authenticated-orcid":false,"given":"Wim","family":"Heirman","sequence":"additional","affiliation":[{"name":"Intel, Brussels, Belgium"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2595-5228","authenticated-orcid":false,"given":"Josep","family":"Torrellas","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign, Urbana, Illinois, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya et\u00a0al. 2023. GPT-4 Technical Report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_2_3_2","volume-title":"Intel Details Sierra Forest and Granite Rapids Architectures, Xeon Roadmap","author":"Alcorn Paul","year":"2023","unstructured":"Paul Alcorn. 2023. Intel Details Sierra Forest and Granite Rapids Architectures, Xeon Roadmap. https:\/\/www.tomshardware.com\/news\/intel-details-sierra-forest-and-granite-rapids-architecture-xeon-roadmap Accessed: 2025-09-04."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","unstructured":"Rajeev Balasubramonian Andrew\u00a0B Kahng Naveen Muralimanohar Ali Shafiee and Vaishnav Srinivas. 2017. CACTI 7: New tools for interconnect exploration in innovative off-chip memories. ACM Transactions on Architecture and Code Optimization (TACO) 14 2 (2017) 1\u201325.","DOI":"10.1145\/3085572"},{"key":"e_1_3_3_2_5_2","volume-title":"Matrix-multiply Assist Best Practices Guide","author":"Bhat Puneeth","year":"2021","unstructured":"Puneeth Bhat, Jos\u00e9 Moreira, and Satish\u00a0Kumar Sadasivam. 2021. Matrix-multiply Assist Best Practices Guide. Technical Report. IBM, Tech. Rep., 2021.[Online]. Available: htt\u00a0ps:\/\/www.redbooks.ibm.com."},{"key":"e_1_3_3_2_6_2","volume-title":"Hot Chips","author":"Biswas Arijit","year":"2021","unstructured":"Arijit Biswas and Sailesh Kottapalli. 2021. Next-Gen Intel Xeon CPU-Sapphire Rapids. In Hot Chips , Vol.\u00a033."},{"key":"e_1_3_3_2_7_2","unstructured":"Davis Blalock Jose\u00a0Javier Gonzalez\u00a0Ortiz Jonathan Frankle and John Guttag. 2020. What is the state of neural network pruning?Proceedings of machine learning and systems 2 (2020) 129\u2013146."},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3655592"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/2786572.2786579"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Trevor\u00a0E. Carlson Wim Heirman Stijn Eyerman Ibrahim Hur and Lieven Eeckhout. 2014. An Evaluation of High-Level Mechanistic Core Models. ACM Transactions on Architecture and Code Optimization (TACO) 11 3 Article 28 (Aug. 2014) 25\u00a0pages.","DOI":"10.1145\/2629677"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00092"},{"key":"e_1_3_3_2_12_2","unstructured":"Xiongjie Dai. 2025. GPU-Benchmarks-on-LLM-Inference. https:\/\/github.com\/XiongjieDai\/GPU-Benchmarks-on-LLM-Inference"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Jo\u00e3o\u00a0PL de Carvalho Jos\u00e9\u00a0E Moreira and Jos\u00e9\u00a0Nelson Amaral. 2022. Compiling for the IBM matrix engine for enterprise workloads. IEEE Micro 42 5 (2022) 34\u201340.","DOI":"10.1109\/MM.2022.3176529"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Lei Deng Guoqi Li Song Han Luping Shi and Yuan Xie. 2020. Model compression and hardware acceleration for neural networks: A comprehensive survey. Proc. IEEE 108 4 (2020) 485\u2013532.","DOI":"10.1109\/JPROC.2020.2976475"},{"key":"e_1_3_3_2_15_2","unstructured":"Haozheng Fan Hao Zhou Guangtai Huang Parameswaran Raman Xinwei Fu Gaurav Gupta Dhananjay Ram Yida Wang and Jun Huan. 2024. HLAT: High-quality Large Language Model Pre-trained on AWS Trainium. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.10630 (2024)."},{"key":"e_1_3_3_2_16_2","first-page":"10323","volume-title":"International Conference on Machine Learning","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar and Dan Alistarh. 2023. SparseGPT: Massive Language Models Can Be Accurately Pruned in One-shot. In International Conference on Machine Learning. PMLR, 10323\u201310337."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476206"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"crossref","unstructured":"Evangelos Georganas Dhiraj Kalamkar Kirill Voronin Abhisek Kundu Antonio Noack Hans Pabst Alexander Breuer and Alexander Heinecke. 2023. Harnessing Deep Learning and HPC Kernels via High-Level Loop and Tensor Abstractions on CPU Architectures. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.12576 (2023).","DOI":"10.1109\/IPDPS57955.2024.00089"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00081"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589054"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640365"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1201\/9781003162810-13"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358291"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00070"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527403"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00035"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00060"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Song Han Xingyu Liu Huizi Mao Jing Pu Ardavan Pedram Mark\u00a0A Horowitz and William\u00a0J Dally. 2016. EIE: Efficient inference engine on compressed deep neural network. ACM SIGARCH Computer Architecture News 44 3 (2016) 243\u2013254.","DOI":"10.1145\/3007787.3001163"},{"key":"e_1_3_3_2_29_2","unstructured":"Song Han Huizi Mao and William\u00a0J Dally. 2015. Deep compression: Compressing deep neural networks with pruning trained quantization and huffman coding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1510.00149 (2015)."},{"key":"e_1_3_3_2_30_2","unstructured":"Simla\u00a0Burcu Harma Ayan Chakraborty Elizaveta Kostenok Danila Mishin Dongho Ha Babak Falsafi Martin Jaggi Ming Liu Yunho Oh Suvinay Subramanian and Amir Yazdanbakhsh. 2024. Effective Interplay between Sparsity and Quantization: From Theory to Practice. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.20935 (2024)."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358275"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.83"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00047"},{"key":"e_1_3_3_2_35_2","unstructured":"Torsten Hoefler Dan Alistarh Tal Ben-Nun Nikoli Dryden and Alexandra Peste. 2021. Sparsity in deep learning: Pruning and growth for efficient inference and training in neural networks. Journal of Machine Learning Research 22 241 (2021) 1\u2013124."},{"key":"e_1_3_3_2_36_2","unstructured":"Intel. 2022. Accelerate Artificial Intelligence (AI) Workloads with Intel Advanced Matrix Extensions (Intel AMX). https:\/\/www.intel.com\/content\/dam\/www\/central-libraries\/us\/en\/documents\/2022-12\/accelerate-ai-with-amx-sb.pdf"},{"key":"e_1_3_3_2_37_2","volume-title":"Intel\u00ae 64 and IA-32 Architectures Optimization Reference Manual","year":"2023","unstructured":"Intel 2023. Intel\u00ae 64 and IA-32 Architectures Optimization Reference Manual. Intel. Chapter 20: Intel AMX, Section 20.17.2 \u2013 Intel\u00ae Hyper-Threading Technology. Available at https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/articles\/technical\/intel-sdm.html."},{"key":"e_1_3_3_2_38_2","volume-title":"Intel\u00ae 64 and IA-32 Architectures Optimization Reference Manual","year":"2024","unstructured":"Intel. 2024. Intel\u00ae 64 and IA-32 Architectures Optimization Reference Manual."},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00064"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071058"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586257"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_3_2_43_2","unstructured":"Christoforos Kachris. 2024. A Survey on Hardware Accelerators for Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.09890 (2024)."},{"key":"e_1_3_3_2_44_2","unstructured":"Dhiraj Kalamkar Dheevatsa Mudigere Naveen Mellempudi Dipankar Das Kunal Banerjee Sasikanth Avancha Dharma\u00a0Teja Vooturi Nataraj Jammalamadaka Jianyu Huang Hector Yuen Jiyan Yang Jongsoo Park Alexander Heinecke Evangelos Georganas Sudarshan Srinivasan Abhisek Kundu Misha Smelyanskiy Bharat Kaul and Pradeep Dubey. 2019. A study of BFLOAT16 for deep learning training. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1905.12322 (2019)."},{"key":"e_1_3_3_2_45_2","unstructured":"Dinesh Kalla Nathan Smith Fnu Samaah and Sivaraju Kuraku. 2023. Study and analysis of chat GPT and its impact on different fields of study. International journal of innovative science and research technology 8 3 (2023)."},{"key":"e_1_3_3_2_46_2","unstructured":"Jeonghoon Kim Jung\u00a0Hyun Lee Sungdong Kim Joonsuk Park Kang\u00a0Min Yoo Se\u00a0Jung Kwon and Dongsoo Lee. 2024. Memory-efficient fine-tuning of compressed large language models via sub-4-bit integer quantization. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_2_47_2","unstructured":"Yann LeCun John Denker and Sara Solla. 1989. Optimal brain damage. Advances in neural information processing systems 2 (1989)."},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"crossref","unstructured":"Tailin Liang John Glossner Lei Wang Shaobo Shi and Xiaotong Zhang. 2021. Pruning and quantization for deep neural network acceleration: A survey. Neurocomputing 461 (2021) 370\u2013403.","DOI":"10.1016\/j.neucom.2021.07.045"},{"key":"e_1_3_3_2_49_2","unstructured":"Ji Lin Jiaming Tang Haotian Tang Shang Yang Wei-Ming Chen Wei-Chen Wang Guangxuan Xiao Xingyu Dang Chuang Gan and Song Han. 2023. AWQ: Activation-aware weight quantization for LLM compression and acceleration. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.00978 (2023)."},{"key":"e_1_3_3_2_50_2","first-page":"7021","volume-title":"International Conference on Machine Learning","author":"Liu Liyang","year":"2021","unstructured":"Liyang Liu, Shilong Zhang, Zhanghui Kuang, Aojun Zhou, Jing-Hao Xue, Xinjiang Wang, Yimin Chen, Wenming Yang, Qingmin Liao, and Wayne Zhang. 2021. Group Fisher pruning for practical network compression. In International Conference on Machine Learning. PMLR, 7021\u20137032."},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480125"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2019.00013"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"crossref","unstructured":"Weile Luo Ruibo Fan Zeyu Li Dayou Du Qiang Wang and Xiaowen Chu. 2024. Benchmarking and dissecting the NVIDIA Hopper GPU architecture. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.13499 (2024).","DOI":"10.1109\/IPDPS57955.2024.00064"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2018.00091"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42614.2022.9731107"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","unstructured":"Thomas Norrie Nishant Patil Doe\u00a0Hyun Yoon George Kurian Sheng Li James Laudon Cliff Young Norman Jouppi and David Patterson. 2021. The Design Process for Google\u2019s Training Chips: TPUv2 and TPUv3. IEEE Micro 41 2 (2021) 56\u201363. 10.1109\/MM.2021.3058217","DOI":"10.1109\/MM.2021.3058217"},{"key":"e_1_3_3_2_57_2","unstructured":"NVIDIA. 2024. NVIDIA Blackwell Architecture Technical Brief. Retrieved 2024 from https:\/\/resources.nvidia.com\/en-us-blackwell-architecture"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527400"},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/264107.264201"},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"crossref","unstructured":"Angshuman Parashar Minsoo Rhu Anurag Mukkara Antonio Puglielli Rangharajan Venkatesan Brucek Khailany Joel Emer Stephen\u00a0W Keckler and William\u00a0J Dally. 2017. SCNN: An accelerator for compressed-sparse convolutional neural networks. ACM SIGARCH computer architecture news 45 2 (2017) 27\u201340.","DOI":"10.1145\/3140659.3080254"},{"key":"e_1_3_3_2_61_2","unstructured":"Pratyush Patel Esha Choukse Chaojie Zhang \u00cd\u00f1igo Goiri Aashaka Shah Saeed Maleki and Ricardo Bianchini. 2023. Splitwise: Efficient generative LLM inference using phase splitting. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.18677 (2023)."},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"crossref","unstructured":"Christodoulos Peltekis Vasileios Titopoulos Chrysostomos Nicopoulos and Giorgos Dimitrakopoulos. 2024. DeMM: A Decoupled Matrix Multiplication Engine Supporting Relaxed Structured Sparsity. IEEE Computer Architecture Letters (2024).","DOI":"10.1109\/LCA.2024.3355178"},{"key":"e_1_3_3_2_63_2","unstructured":"Alexandra Peste Eugenia Iofinova Adrian Vladu and Dan Alistarh. 2021. AC\/DC: Alternating compressed\/decompressed training of deep neural networks. Advances in neural information processing systems 34 (2021) 8557\u20138570."},{"key":"e_1_3_3_2_64_2","unstructured":"Bita\u00a0Darvish Rouhani Nitin Garegrat Tom Savell Ankit More Kyung-Nam Han et\u00a0al. 2023. OCP Microscaling Formats (MX) Specification. https:\/\/www.opencompute.org\/documents\/ocp-microscaling-formats-mx-v1-0-spec-final-pdf"},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586094"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"crossref","unstructured":"Sungju Ryu Hyungjun Kim Wooseok Yi Eunhwan Kim Yulhwa Kim Taesu Kim and Jae-Joon Kim. 2022. BitBlade: Energy-efficient variable bit-precision hardware accelerator for quantized neural networks. IEEE Journal of Solid-State Circuits 57 6 (2022) 1924\u20131935.","DOI":"10.1109\/JSSC.2022.3141050"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614284"},{"key":"e_1_3_3_2_68_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00062"},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"crossref","unstructured":"Nigel Stephens Stuart Biles Matthias Boettcher Jacob Eapen Mbou Eyole Giacomo Gabrielli Matt Horsnell Grigorios Magklis Alejandro Martinez Nathanael Premillieu et\u00a0al. 2017. The ARM Scalable Vector Extension. IEEE micro 37 2 (2017) 26\u201339.","DOI":"10.1109\/MM.2017.35"},{"key":"e_1_3_3_2_70_2","doi-asserted-by":"crossref","unstructured":"Aaron Stillmaker and Bevan Baas. 2017. Scaling equations for the accurate prediction of CMOS device performance from 180 nm to 7 nm. Integration the VLSI Journal 58 (2017) 74\u201381. http:\/\/vcl.ece.ucdavis.edu\/pubs\/2017.02.VLSIintegration.TechScale\/.","DOI":"10.1016\/j.vlsi.2017.02.002"},{"key":"e_1_3_3_2_71_2","unstructured":"Jovan Stojkovic Esha Choukse Chaojie Zhang Inigo Goiri and Josep Torrellas. 2024. Towards Greener LLMs: Bringing Energy-efficiency to the Forefront of LLM Inference. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.20306 (2024)."},{"key":"e_1_3_3_2_72_2","unstructured":"Qidong Su Christina Giannoula and Gennady Pekhimenko. 2023. The synergy of speculative decoding and batching in serving large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.18813 (2023)."},{"key":"e_1_3_3_2_73_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi et\u00a0al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.09288 (2023)."},{"key":"e_1_3_3_2_74_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_2_75_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"e_1_3_3_2_76_2","unstructured":"Xiuying Wei Yunchen Zhang Yuhang Li Xiangguo Zhang Ruihao Gong Jinyang Guo and Xianglong Liu. 2023. Outlier suppression+: Accurate quantization of large language models by equivalent and optimal shifting and scaling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.09145 (2023)."},{"key":"e_1_3_3_2_77_2","unstructured":"Wikipedia. 2024. Sapphire Rapids Die Configurations. https:\/\/en.wikipedia.org\/wiki\/Sapphire_Rapids#Die_configurations."},{"key":"e_1_3_3_2_78_2","doi-asserted-by":"publisher","DOI":"10.1109\/PMBS56514.2022.00018"},{"key":"e_1_3_3_2_79_2","doi-asserted-by":"crossref","unstructured":"Samuel Williams Andrew Waterman and David Patterson. 2009. Roofline: An insightful visual performance model for multicore architectures. Commun. ACM 52 4 (2009) 65\u201376.","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_3_2_80_2","doi-asserted-by":"crossref","unstructured":"Haojun Xia Zhen Zheng Yuchao Li Donglin Zhuang Zhongzhu Zhou Xiafei Qiu Yong Li Wei Lin and Shuaiwen\u00a0Leon Song. 2023. Flash-LLM: Enabling Cost-Effective and Highly-Efficient Large Generative Model Inference with Unstructured Sparsity. Proceedings of the VLDB Endowment 17 2 (2023) 211\u2013224.","DOI":"10.14778\/3626292.3626303"},{"key":"e_1_3_3_2_81_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00072"},{"key":"e_1_3_3_2_82_2","unstructured":"Binwei Yao Ming Jiang Diyi Yang and Junjie Hu. 2023. Empowering LLM-based machine translation with cultural awareness. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.14328 (2023)."},{"key":"e_1_3_3_2_83_2","unstructured":"Zhihang Yuan Yuzhang Shang Yang Zhou Zhen Dong Zhe Zhou Chenhao Xue Bingzhe Wu Zhikai Li Qingyi Gu Yong\u00a0Jae Lee Yan Yan Beidi Chen Guangyu Sun and Kurt Keutzer. 2024. LLM Inference Unveiled: Survey and Roofline Model Insights. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.16363 (2024)."},{"key":"e_1_3_3_2_84_2","doi-asserted-by":"crossref","unstructured":"Hao Zhang Dongdong Chen and Seok-Bum Ko. 2019. New flexible multiple-precision multiply-accumulate unit for deep neural network training and inference. IEEE Trans. Comput. 69 1 (2019) 26\u201338.","DOI":"10.1109\/TC.2019.2936192"},{"key":"e_1_3_3_2_85_2","doi-asserted-by":"crossref","unstructured":"Haopeng Zhang Xiao Liu and Jiawei Zhang. 2023. SummIt: Iterative text summarization via chatGPT. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.14835 (2023).","DOI":"10.18653\/v1\/2023.findings-emnlp.714"},{"key":"e_1_3_3_2_86_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783723"},{"key":"e_1_3_3_2_87_2","unstructured":"Susan Zhang Stephen Roller Naman Goyal Mikel Artetxe Moya Chen Shuohui Chen Christopher Dewan Mona Diab Xian Li Xi\u00a0Victoria Lin Todor Mihaylov Myle Ott Sam Shleifer Kurt Shuster Daniel Simig Punit\u00a0Singh Koura Anjali Sridhar Tianlu Wang and Luke Zettlemoyer. 2022. OPT: Open pre-trained transformer language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2205.01068 (2022)."},{"key":"e_1_3_3_2_88_2","unstructured":"Wayne\u00a0Xin Zhao Kun Zhou Junyi Li Tianyi Tang Xiaolei Wang Yupeng Hou Yingqian Min Beichen Zhang Junjie Zhang Zican Dong Yifan Du Chen Yang Yushuo Chen Zhipeng Chen Jinhao Jiang Ruiyang Ren Yifan Li Xinyu Tang Zikang Liu Peiyu Liu Jian-Yun Nie and Ji-Rong Wen. 2023. A survey of large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.18223 (2023)."},{"key":"e_1_3_3_2_89_2","unstructured":"Yilong Zhao Chien-Yu Lin Kan Zhu Zihao Ye Lequn Chen Size Zheng Luis Ceze Arvind Krishnamurthy Tianqi Chen and Baris Kasikci. 2024. Atom: Low-bit quantization for efficient and accurate LLM serving. Proceedings of Machine Learning and Systems 6 (2024) 196\u2013209."},{"key":"e_1_3_3_2_90_2","doi-asserted-by":"crossref","unstructured":"Pengyuan Zhou Lin Wang Zhi Liu Yanbin Hao Pan Hui Sasu Tarkoma and Jussi Kangasharju. 2024. A survey on generative AI and LLM for video generation understanding and streaming. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.16038 (2024).","DOI":"10.36227\/techrxiv.171172801.19993069\/v1"},{"key":"e_1_3_3_2_91_2","unstructured":"Xunyu Zhu Jian Li Yong Liu Can Ma and Weiping Wang. 2023. A survey on model compression for large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.07633 (2023)."},{"key":"e_1_3_3_2_92_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00020"}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756073","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756073","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:42:16Z","timestamp":1769463736000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756073"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":91,"alternative-id":["10.1145\/3725843.3756073","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756073","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}