{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T14:13:51Z","timestamp":1773843231438,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T00:00:00Z","timestamp":1750377600000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2112562"],"award-info":[{"award-number":["2112562"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000183","name":"Army Research Office","doi-asserted-by":"publisher","award":["W911NF-23-2-0224"],"award-info":[{"award-number":["W911NF-23-2-0224"]}],"id":[{"id":"10.13039\/100000183","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,21]]},"DOI":"10.1145\/3695053.3731043","type":"proceedings-article","created":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T16:43:11Z","timestamp":1750437791000},"page":"990-1004","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Transitive Array: An Efficient GEMM Accelerator with Result Reuse"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4479-5525","authenticated-orcid":false,"given":"Cong","family":"Guo","sequence":"first","affiliation":[{"name":"Duke University, Durham, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8815-7948","authenticated-orcid":false,"given":"Chiyue","family":"Wei","sequence":"additional","affiliation":[{"name":"Duke University, Durham, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4186-6561","authenticated-orcid":false,"given":"Jiaming","family":"Tang","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, Cambridge, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9085-5025","authenticated-orcid":false,"given":"Bowen","family":"Duan","sequence":"additional","affiliation":[{"name":"Duke University, Durham, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4186-7618","authenticated-orcid":false,"given":"Song","family":"Han","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, Cambridge, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3228-6544","authenticated-orcid":false,"given":"Hai","family":"Li","sequence":"additional","affiliation":[{"name":"Duke University, Durham, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1486-8412","authenticated-orcid":false,"given":"Yiran","family":"Chen","sequence":"additional","affiliation":[{"name":"Duke University, Durham, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,6,20]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Jorge Albericio Alberto Delmas Patrick Judd Sayeh Sharify Gerard O\u2019Leary Roman Genov and Andreas Moshovos. 2017. Bit-Pragmatic Deep Neural Network Computing. 50th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO) (2017).","DOI":"10.1145\/3123939.3123982"},{"key":"e_1_3_3_1_3_2","unstructured":"Saleh Ashkboos Amirkeivan Mohtashami Maximilian\u00a0L. Croci Bo Li Pashmina Cameron Martin Jaggi Dan Alistarh Torsten Hoefler and James Hensman. 2024. QuaRot: Outlier-Free 4-Bit Inference in Rotated LLMs. arxiv:https:\/\/arXiv.org\/abs\/2404.00456\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2404.00456"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Kenneth\u00a0E Batcher. 1968. Sorting Networks and Their Applications. 
Proceedings of the April 30\u2013May 2 1968 Spring Joint Computer Conference (1968) 307\u2013314.","DOI":"10.1145\/1468075.1468121"},{"key":"e_1_3_3_1_5_2","unstructured":"Vaclav\u00a0E. Benes. 1964. Mathematical Theory of Connecting Networks and Telephone Traffic. Mathematics in Science and Engineering 17 (1964)."},{"key":"e_1_3_3_1_6_2","unstructured":"Peter\u00a0F Brown Stephen\u00a0A Della\u00a0Pietra Vincent\u00a0J Della\u00a0Pietra Jennifer\u00a0C Lai and Robert\u00a0L Mercer. 1992. An estimate of an upper bound for the entropy of English. Computational Linguistics 18 1 (1992) 31\u201340."},{"key":"e_1_3_3_1_7_2","unstructured":"Tom\u00a0B Brown. 2020. Language models are few-shot learners. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2005.14165 (2020)."},{"key":"e_1_3_3_1_8_2","unstructured":"Tom\u00a0B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel\u00a0M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. arxiv:https:\/\/arXiv.org\/abs\/2005.14165\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2005.14165"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Yuzong Chen Jian Meng Jae-sun Seo and Mohamed\u00a0S Abdelfattah. 2024. BBS: Bi-directional Bit-level Sparsity for Deep Learning Acceleration. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.05227 (2024).","DOI":"10.1109\/MICRO61859.2024.00048"},{"key":"e_1_3_3_1_10_2","unstructured":"Yuzong Chen Jian Meng Jae sun Seo and Mohamed\u00a0S. Abdelfattah. 2024. BBS: Bi-directional Bit-level Sparsity for Deep Learning Acceleration. 
arxiv:https:\/\/arXiv.org\/abs\/2409.05227\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2409.05227"},{"key":"e_1_3_3_1_11_2","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cuDNN: Efficient Primitives for Deep Learning. arxiv:https:\/\/arXiv.org\/abs\/1410.0759\u00a0[cs.NE] https:\/\/arxiv.org\/abs\/1410.0759"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Ping Chi Shuangchen Li Cong Xu Tao Zhang Jishen Zhao Yongpan Liu Yu Wang and Yuan Xie. 2016. Prime: A novel processing-in-memory architecture for neural network computation in reram-based main memory. ACM SIGARCH Computer Architecture News 44 3 (2016) 27\u201339.","DOI":"10.1145\/3007787.3001140"},{"key":"e_1_3_3_1_13_2","unstructured":"Wikipedia contributors. [n. d.]. Hasse diagram \u2014 Wikipedia The Free Encyclopedia. https:\/\/en.wikipedia.org\/wiki\/Hasse_diagram"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_3_1_15_2","unstructured":"Jacob Devlin. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1810.04805 (2018)."},{"key":"e_1_3_3_1_16_2","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan Anirudh Goyal Anthony Hartshorn Aobo Yang Archi Mitra Archie Sravankumar Artem Korenev Arthur Hinsvark Arun Rao Aston Zhang Aurelien Rodriguez Austen Gregerson Ava Spataru Baptiste Roziere Bethany Biron Binh Tang Bobbie Chern Charlotte Caucheteux Chaya Nayak Chloe Bi Chris Marra Chris McConnell Christian Keller Christophe Touret Chunyang Wu Corinne Wong Cristian\u00a0Canton Ferrer Cyrus Nikolaidis Damien Allonsius Daniel Song Danielle Pintz Danny Livshits David Esiobu Dhruv Choudhary Dhruv Mahajan Diego Garcia-Olano Diego Perino Dieuwke Hupkes Egor Lakomkin Ehab AlBadawy Elina Lobanova Emily Dinan Eric\u00a0Michael Smith Filip Radenovic Frank Zhang Gabriel Synnaeve Gabrielle Lee Georgia\u00a0Lewis Anderson Graeme Nail Gregoire Mialon Guan Pang Guillem Cucurell Hailey Nguyen Hannah Korevaar Hu Xu Hugo Touvron Iliyan Zarov Imanol\u00a0Arrieta Ibarra Isabel Kloumann Ishan Misra Ivan Evtimov Jade Copet Jaewon Lee Jan Geffert Jana Vranes Jason Park Jay Mahadeokar Jeet Shah Jelmer van\u00a0der Linde Jennifer Billock Jenny Hong Jenya Lee Jeremy Fu Jianfeng Chi Jianyu Huang Jiawen Liu Jie Wang Jiecao Yu Joanna Bitton Joe Spisak Jongsoo Park Joseph Rocca Joshua Johnstun Joshua Saxe Junteng Jia Kalyan\u00a0Vasuden Alwala Kartikeya Upasani Kate Plawiak Ke Li Kenneth Heafield Kevin Stone Khalid El-Arini Krithika Iyer Kshitiz Malik Kuenley Chiu Kunal Bhalla Lauren Rantala-Yeary Laurens van\u00a0der Maaten Lawrence Chen Liang Tan Liz Jenkins Louis Martin Lovish Madaan Lubo Malo Lukas Blecher Lukas Landzaat Luke de Oliveira Madeline Muzzi Mahesh Pasupuleti Mannat Singh Manohar Paluri Marcin Kardas Mathew Oldham Mathieu Rita Maya Pavlova Melanie Kambadur Mike Lewis Min Si Mitesh\u00a0Kumar Singh Mona Hassan Naman Goyal Narjes Torabi 
Nikolay Bashlykov Nikolay Bogoychev Niladri Chatterji Olivier Duchenne Onur \u00c7elebi Patrick Alrassy Pengchuan Zhang Pengwei Li Petar Vasic Peter Weng Prajjwal Bhargava Pratik Dubal Praveen Krishnan Punit\u00a0Singh Koura Puxin Xu Qing He Qingxiao Dong Ragavan Srinivasan Raj Ganapathy Ramon Calderer Ricardo\u00a0Silveira Cabral Robert Stojnic Roberta Raileanu Rohit Girdhar Rohit Patel Romain Sauvestre Ronnie Polidoro Roshan Sumbaly Ross Taylor Ruan Silva Rui Hou Rui Wang Saghar Hosseini Sahana Chennabasappa Sanjay Singh Sean Bell Seohyun\u00a0Sonia Kim Sergey Edunov Shaoliang Nie Sharan Narang Sharath Raparthy Sheng Shen Shengye Wan Shruti Bhosale Shun Zhang Simon Vandenhende Soumya Batra Spencer Whitman Sten Sootla Stephane Collot Suchin Gururangan Sydney Borodinsky Tamar Herman Tara Fowler Tarek Sheasha Thomas Georgiou Thomas Scialom Tobias Speckbacher Todor Mihaylov Tong Xiao Ujjwal Karn Vedanuj Goswami Vibhor Gupta Vignesh Ramanathan Viktor Kerkez Vincent Gonguet Virginie Do Vish Vogeti Vladan Petrovic Weiwei Chu Wenhan Xiong Wenyin Fu Whitney Meers Xavier Martinet Xiaodong Wang Xiaoqing\u00a0Ellen Tan Xinfeng Xie Xuchao Jia Xuewei Wang Yaelle Goldschlag Yashesh Gaur Yasmine Babaei Yi Wen Yiwen Song Yuchen Zhang Yue Li Yuning Mao Zacharie\u00a0Delpierre Coudert Zheng Yan Zhengxing Chen Zoe Papakipos Aaditya Singh Aaron Grattafiori Abha Jain Adam Kelsey Adam Shajnfeld Adithya Gangidi Adolfo Victoria Ahuva Goldstand Ajay Menon Ajay Sharma Alex Boesenberg Alex Vaughan Alexei Baevski Allie Feinstein Amanda Kallet Amit Sangani Anam Yunus Andrei Lupu Andres Alvarado Andrew Caples Andrew Gu Andrew Ho Andrew Poulton Andrew Ryan Ankit Ramchandani Annie Franco Aparajita Saraf Arkabandhu Chowdhury Ashley Gabriel Ashwin Bharambe Assaf Eisenman Azadeh Yazdan Beau James Ben Maurer Benjamin Leonhardi Bernie Huang Beth Loyd Beto\u00a0De Paola Bhargavi Paranjape Bing Liu Bo Wu Boyu Ni Braden Hancock Bram Wasti Brandon Spence Brani Stojkovic Brian Gamido Britt Montalvo Carl 
Parker Carly Burton Catalina Mejia Changhan Wang Changkyu Kim Chao Zhou Chester Hu Ching-Hsiang Chu Chris Cai Chris Tindal Christoph Feichtenhofer Damon Civin Dana Beaty Daniel Kreymer Daniel Li Danny Wyatt David Adkins David Xu Davide Testuggine Delia David Devi Parikh Diana Liskovich Didem Foss Dingkang Wang Duc Le Dustin Holland Edward Dowling Eissa Jamil Elaine Montgomery Eleonora Presani Emily Hahn Emily Wood Erik Brinkman Esteban Arcaute Evan Dunbar Evan Smothers Fei Sun Felix Kreuk Feng Tian Firat Ozgenel Francesco Caggioni Francisco Guzm\u00e1n Frank Kanayet Frank Seide Gabriela\u00a0Medina Florez Gabriella Schwarz Gada Badeer Georgia Swee Gil Halpern Govind Thattai Grant Herman Grigory Sizov Guangyi Zhang Guna Lakshminarayanan Hamid Shojanazeri Han Zou Hannah Wang Hanwen Zha Haroun Habeeb Harrison Rudolph Helen Suk Henry Aspegren Hunter Goldman Ibrahim Damlaj Igor Molybog Igor Tufanov Irina-Elena Veliche Itai Gat Jake Weissman James Geboski James Kohli Japhet Asher Jean-Baptiste Gaya Jeff Marcus Jeff Tang Jennifer Chan Jenny Zhen Jeremy Reizenstein Jeremy Teboul Jessica Zhong Jian Jin Jingyi Yang Joe Cummings Jon Carvill Jon Shepard Jonathan McPhie Jonathan Torres Josh Ginsburg Junjie Wang Kai Wu Kam\u00a0Hou U Karan Saxena Karthik Prasad Kartikay Khandelwal Katayoun Zand Kathy Matosich Kaushik Veeraraghavan Kelly Michelena Keqian Li Kun Huang Kunal Chawla Kushal Lakhotia Kyle Huang Lailin Chen Lakshya Garg Lavender A Leandro Silva Lee Bell Lei Zhang Liangpeng Guo Licheng Yu Liron Moshkovich Luca Wehrstedt Madian Khabsa Manav Avalani Manish Bhatt Maria Tsimpoukelli Martynas Mankus Matan Hasson Matthew Lennie Matthias Reso Maxim Groshev Maxim Naumov Maya Lathi Meghan Keneally Michael\u00a0L. 
Seltzer Michal Valko Michelle Restrepo Mihir Patel Mik Vyatskov Mikayel Samvelyan Mike Clark Mike Macey Mike Wang Miquel\u00a0Jubert Hermoso Mo Metanat Mohammad Rastegari Munish Bansal Nandhini Santhanam Natascha Parks Natasha White Navyata Bawa Nayan Singhal Nick Egebo Nicolas Usunier Nikolay\u00a0Pavlovich Laptev Ning Dong Ning Zhang Norman Cheng Oleg Chernoguz Olivia Hart Omkar Salpekar Ozlem Kalinli Parkin Kent Parth Parekh Paul Saab Pavan Balaji Pedro Rittner Philip Bontrager Pierre Roux Piotr Dollar Polina Zvyagina Prashant Ratanchandani Pritish Yuvraj Qian Liang Rachad Alao Rachel Rodriguez Rafi Ayub Raghotham Murthy Raghu Nayani Rahul Mitra Raymond Li Rebekkah Hogan Robin Battey Rocky Wang Rohan Maheswari Russ Howes Ruty Rinott Sai\u00a0Jayesh Bondu Samyak Datta Sara Chugh Sara Hunt Sargun Dhillon Sasha Sidorov Satadru Pan Saurabh Verma Seiji Yamamoto Sharadh Ramaswamy Shaun Lindsay Shaun Lindsay Sheng Feng Shenghao Lin Shengxin\u00a0Cindy Zha Shiva Shankar Shuqiang Zhang Shuqiang Zhang Sinong Wang Sneha Agarwal Soji Sajuyigbe Soumith Chintala Stephanie Max Stephen Chen Steve Kehoe Steve Satterfield Sudarshan Govindaprasad Sumit Gupta Sungmin Cho Sunny Virk Suraj Subramanian Sy Choudhury Sydney Goldman Tal Remez Tamar Glaser Tamara Best Thilo Kohler Thomas Robinson Tianhe Li Tianjun Zhang Tim Matthews Timothy Chou Tzook Shaked Varun Vontimitta Victoria Ajayi Victoria Montanez Vijai Mohan Vinay\u00a0Satish Kumar Vishal Mangla V\u00edtor Albiero Vlad Ionescu Vlad Poenaru Vlad\u00a0Tiberiu Mihailescu Vladimir Ivanov Wei Li Wenchen Wang Wenwen Jiang Wes Bouaziz Will Constable Xiaocheng Tang Xiaofang Wang Xiaojian Wu Xiaolan Wang Xide Xia Xilun Wu Xinbo Gao Yanjun Chen Ye Hu Ye Jia Ye Qi Yenda Li Yilin Zhang Ying Zhang Yossi Adi Youngjin Nam Yu Wang Yuchen Hao Yundi Qian Yuzi He Zach Rait Zachary DeVito Zef Rosnbrick Zhaoduo Wen Zhenyu Yang and Zhiwei Zhao. 2024. The Llama 3 Herd of Models. 
arxiv:https:\/\/arXiv.org\/abs\/2407.21783\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Michael\u00a0J. Flynn. 1972. Some Computer Organizations and Their Effectiveness. IEEE Trans. Comput. C-21 (1972) 948\u2013960. https:\/\/api.semanticscholar.org\/CorpusID:18573685","DOI":"10.1109\/TC.1972.5009071"},{"key":"e_1_3_3_1_18_2","unstructured":"Ian Goodfellow. 2016. Deep learning."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Cong Guo Feng Cheng Zhixu Du James Kiessling Jonathan Ku Shiyu Li Ziru Li Mingyuan Ma Tergel Molom-Ochir Benjamin Morris et\u00a0al. 2025. A Survey: Collaborative Hardware and Software Design in the Era of Large Language Models. IEEE Circuits and Systems Magazine 25 1 (2025) 35\u201357.","DOI":"10.1109\/MCAS.2024.3476008"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433722"},{"key":"e_1_3_3_1_21_2","volume-title":"International Conference on Learning Representations","author":"Guo Cong","year":"2022","unstructured":"Cong Guo, Yuxian Qiu, Jingwen Leng, Xiaotian Gao, Chen Zhang, Yunxin Liu, Fan Yang, Yuhao Zhu, and Minyi Guo. 2022. SQuant: On-the-Fly Data-Free Quantization via Diagonal Hessian Approximation. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=JXhROKNZzOc"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589038"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Cong Guo Fengchen Xue Jingwen Leng Yuxian Qiu Yue Guan Weihao Cui Quan Chen and Minyi Guo. 2024. Accelerating sparse dnns based on tiled gemm. IEEE Trans. Comput. 
(2024).","DOI":"10.1109\/TC.2024.3365942"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00095"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Song Han Xingyu Liu Huizi Mao Jing Pu Ardavan Pedram Mark\u00a0A Horowitz and William\u00a0J Dally. 2016. EIE: Efficient inference engine on compressed deep neural network. ACM SIGARCH Computer Architecture News 44 3 (2016) 243\u2013254.","DOI":"10.1145\/3007787.3001163"},{"key":"e_1_3_3_1_26_2","unstructured":"Song Han Huizi Mao and William\u00a0J Dally. 2015. Deep compression: Compressing deep neural networks with pruning trained quantization and huffman coding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1510.00149 (2015)."},{"key":"e_1_3_3_1_27_2","unstructured":"Helmut Hasse. 1967. Grundlagen der Mathematik in historischer Entwicklung. Mathematisch-Naturwissenschaftliche Bibliothek 31 (1967)."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Kevin Hsieh Eiman Ebrahimi Gwangsun Kim Niladrish Chatterjee Mike O\u2019Connor Nandita Vijaykumar Onur Mutlu and Stephen\u00a0W Keckler. 2016. Transparent offloading and mapping (TOM) enabling programmer-transparent near-data processing in GPU systems. ACM SIGARCH Computer Architecture News 44 3 (2016) 204\u2013216.","DOI":"10.1145\/3007787.3001159"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00086"},{"key":"e_1_3_3_1_31_2","unstructured":"Benoit Jacob Skirmantas Kligys Bo Chen Menglong Zhu Matthew Tang Andrew Howard Hartwig Adam and Dmitry Kalenichenko. 2017. Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference. arxiv:https:\/\/arXiv.org\/abs\/1712.05877\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1712.05877"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"crossref","unstructured":"Dong-Ik Jeon Kyeong-Bin Park and Ki-Seok Chung. 2017. 
HMC-MAC: Processing-in memory architecture for multiply-accumulate operations with hybrid memory cube. IEEE Computer Architecture Letters 17 1 (2017) 5\u20138.","DOI":"10.1109\/LCA.2017.2700298"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Norman\u00a0P. Jouppi Cliff Young Nishant Patil David Patterson Gaurav Agrawal Raminder Bajwa Sarah Bates Suresh Bhatia Nan Boden Al Borchers Rick Boyle Pierre luc Cantin Clifford Chao Chris Clark Jeremy Coriell Mike Daley Matt Dau Jeffrey Dean Ben Gelb Tara\u00a0Vazir Ghaemmaghami Rajendra Gottipati William Gulland Robert Hagmann C.\u00a0Richard Ho Doug Hogberg John Hu Robert Hundt Dan Hurt Julian Ibarz Aaron Jaffey Alek Jaworski Alexander Kaplan Harshit Khaitan Andy Koch Naveen Kumar Steve Lacy James Laudon James Law Diemthu Le Chris Leary Zhuyuan Liu Kyle Lucke Alan Lundin Gordon MacKean Adriana Maggiore Maire Mahony Kieran Miller Rahul Nagarajan Ravi Narayanaswami Ray Ni Kathy Nix Thomas Norrie Mark Omernick Narayana Penukonda Andy Phelps Jonathan Ross Matt Ross Amir Salek Emad Samadiani Chris Severn Gregory Sizikov Matthew Snelham Jed Souter Dan Steinberg Andy Swing Mercedes Tan Gregory Thorson Bo Tian Horia Toma Erick Tuttle Vijay Vasudevan Richard Walter Walter Wang Eric Wilcox and Doe\u00a0Hyun Yoon. 2017. In-Datacenter Performance Analysis of a Tensor Processing Unit. arxiv:https:\/\/arXiv.org\/abs\/1704.04760\u00a0[cs.AR] https:\/\/arxiv.org\/abs\/1704.04760","DOI":"10.1145\/3140659.3080246"},{"key":"e_1_3_3_1_34_2","unstructured":"Jared Kaplan Sam McCandlish Tom Henighan Tom\u00a0B. Brown Benjamin Chess Rewon Child Scott Gray Alec Radford Jeffrey Wu and Dario Amodei. 2020. Scaling Laws for Neural Language Models. arxiv:https:\/\/arXiv.org\/abs\/2001.08361\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2001.08361"},{"key":"e_1_3_3_1_35_2","volume-title":"The Art of Computer Programming","author":"Knuth Donald\u00a0E.","year":"2009","unstructured":"Donald\u00a0E. Knuth. 2009. 
The Art of Computer Programming, Volume 4A: Combinatorial Algorithms, Part 1. In The Art of Computer Programming. Addison-Wesley Professional."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Yann LeCun Yoshua Bengio and Geoffrey Hinton. 2015. Deep learning. nature 521 7553 (2015) 436\u2013444.","DOI":"10.1038\/nature14539"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"Jungi Lee Wonbeom Lee and Jaewoong Sim. 2024. Tender: Accelerating Large Language Models via Tensor Decomposition and Runtime Requantization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.12930 (2024).","DOI":"10.1109\/ISCA59077.2024.00080"},{"key":"e_1_3_3_1_38_2","unstructured":"Yuhang Li Mingzhu Shen Jian Ma Yan Ren Mingxin Zhao Qi Zhang Ruihao Gong Fengwei Yu and Junjie Yan. 2021. Mqbench: Towards reproducible and deployable model quantization benchmark. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2111.03759 (2021)."},{"key":"e_1_3_3_1_39_2","unstructured":"Ji Lin Jiaming Tang Haotian Tang Shang Yang Wei-Ming Chen Wei-Chen Wang Guangxuan Xiao Xingyu Dang Chuang Gan and Song Han. 2024. AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. arxiv:https:\/\/arXiv.org\/abs\/2306.00978\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2306.00978"},{"key":"e_1_3_3_1_40_2","unstructured":"Yujun Lin Haotian Tang Shang Yang Zhekai Zhang Guangxuan Xiao Chuang Gan and Song Han. 2024. QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving. arxiv:https:\/\/arXiv.org\/abs\/2405.04532\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2405.04532"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00112"},{"key":"e_1_3_3_1_42_2","unstructured":"Zechun Liu Changsheng Zhao Igor Fedorov Bilge Soran Dhruv Choudhary Raghuraman Krishnamoorthi Vikas Chandra Yuandong Tian and Tijmen Blankevoort. 2024. SpinQuant: LLM quantization with learned rotations. 
arxiv:https:\/\/arXiv.org\/abs\/2405.16406\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2405.16406"},{"key":"e_1_3_3_1_43_2","unstructured":"Hang Lu Liang Chang Chenglong Li Zixuan Zhu Shengjian Lu Yanhuan Liu and Mingzhe Zhang. 2021. Distilling Bit-level Sparsity Parallelism for General Purpose Deep Learning Acceleration. 54th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO) (2021)."},{"key":"e_1_3_3_1_44_2","unstructured":"Stephen Merity Caiming Xiong James Bradbury and Richard Socher. 2016. Pointer sentinel mixture models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1609.07843 (2016)."},{"key":"e_1_3_3_1_45_2","volume-title":"NVIDIA A100 Tensor Core GPU Architecture","author":"Corporation NVIDIA","year":"2020","unstructured":"NVIDIA Corporation. 2020. NVIDIA A100 Tensor Core GPU Architecture. Technical Report. NVIDIA Corporation. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf Accessed: 2024-11-23."},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00063"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"e_1_3_3_1_48_2","unstructured":"Wenqi Shao Mengzhao Chen Zhaoyang Zhang Peng Xu Lirui Zhao Zhiqian Li Kaipeng Zhang Peng Gao Yu Qiao and Ping Luo. 2024. OmniQuant: Omnidirectionally Calibrated Quantization for Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2308.13137\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2308.13137"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00069"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"crossref","unstructured":"Man Shi Vikram Jain Antony Joseph Maurice Meijer and Marian Verhelst. 2024. BitWave: Exploiting Column-Based Bit-Level Sparsity for Deep Learning Acceleration. 
Proceedings of the 30th IEEE International Symposium on High-Performance Computer Architecture (HPCA) (2024).","DOI":"10.1109\/HPCA57654.2024.00062"},{"key":"e_1_3_3_1_51_2","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arxiv:https:\/\/arXiv.org\/abs\/1909.08053\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_3_1_52_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_1_53_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et\u00a0al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.09288 (2023)."},{"key":"e_1_3_3_1_54_2","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_3_1_55_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N. Gomez Lukasz Kaiser and Illia Polosukhin. 2023. Attention Is All You Need. arxiv:https:\/\/arXiv.org\/abs\/1706.03762\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1706.03762"},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/321439.321449"},{"key":"e_1_3_3_1_57_2","unstructured":"Zhongwei Wan Xin Wang Che Liu Samiul Alam Yu Zheng Jiachen Liu Zhongnan Qu Shen Yan Yi Zhu Quanlu Zhang Mosharaf Chowdhury and Mi Zhang. 2024. Efficient Large Language Models: A Survey. 
arxiv:https:\/\/arXiv.org\/abs\/2312.03863\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2312.03863"},{"key":"e_1_3_3_1_58_2","unstructured":"Hongyu Wang Shuming Ma Li Dong Shaohan Huang Huaijie Wang Lingxiao Ma Fan Yang Ruiping Wang Yi Wu and Furu Wei. 2023. BitNet: Scaling 1-bit Transformers for Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2310.11453\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2310.11453"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00088"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00066"},{"key":"e_1_3_3_1_61_2","unstructured":"Wei Wen Chunpeng Wu Yandan Wang Yiran Chen and Hai Li. 2016. Learning structured sparsity in deep neural networks. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_3_1_62_2","unstructured":"Guangxuan Xiao Ji Lin Mickael Seznec Hao Wu Julien Demouth and Song Han. 2024. SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2211.10438\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2211.10438"},{"key":"e_1_3_3_1_63_2","first-page":"27168","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Yao Zhewei","year":"2022","unstructured":"Zhewei Yao, Reza Yazdani\u00a0Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, and Yuxiong He. 2022. ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. In Advances in Neural Information Processing Systems, S.\u00a0Koyejo, S.\u00a0Mohamed, A.\u00a0Agarwal, D.\u00a0Belgrave, K.\u00a0Cho, and A.\u00a0Oh (Eds.), Vol.\u00a035. Curran Associates, Inc., 27168\u201327183. 
https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/adf7fa39d65e2983d724ff7da57f00ac-Paper-Conference.pdf"},{"key":"e_1_3_3_1_64_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00071"},{"key":"e_1_3_3_1_65_2","doi-asserted-by":"crossref","unstructured":"Chen Zhang Yang Wang Zhiqiang Xie Cong Guo Yunxin Liu Jingwen Leng Guangyu Sun Zhigang Ji Runsheng Wang Yuan Xie et\u00a0al. 2024. DSTC: Dual-Side Sparsity Tensor Core for DNNs Acceleration on Modern GPU Architectures. IEEE Trans. Comput. (2024).","DOI":"10.1109\/TC.2024.3475814"},{"key":"e_1_3_3_1_66_2","unstructured":"Zhenyu Zhang Ying Sheng Tianyi Zhou Tianlong Chen Lianmin Zheng Ruisi Cai Zhao Song Yuandong Tian Christopher R\u00e9 Clark Barrett Zhangyang Wang and Beidi Chen. [n. d.]. H$_2$O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2306.14048 [cs]http:\/\/arxiv.org\/abs\/2306.14048"},{"key":"e_1_3_3_1_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358269"}],"event":{"name":"ISCA '25: Proceedings of the 52nd Annual International Symposium on Computer Architecture","location":"Tokyo Japan","acronym":"ISCA '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 52nd Annual International Symposium on Computer
 Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731043","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731043","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T10:58:19Z","timestamp":1750503499000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695053.3731043"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,20]]},"references-count":66,"alternative-id":["10.1145\/3695053.3731043","10.1145\/3695053"],"URL":"https:\/\/doi.org\/10.1145\/3695053.3731043","relation":{},"subject":[],"published":{"date-parts":[[2025,6,20]]},"assertion":[{"value":"2025-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}