{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T15:31:23Z","timestamp":1773588683033,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":83,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,22]]},"DOI":"10.1145\/3779212.3790189","type":"proceedings-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T13:55:26Z","timestamp":1773150926000},"page":"1216-1234","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Mugi: Value Level Parallelism For Efficient LLMs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-0571-7413","authenticated-orcid":false,"given":"Daniel","family":"Price","sequence":"first","affiliation":[{"name":"Department of ECE, University of Central Florida, Orlando, Florida, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7750-8725","authenticated-orcid":false,"given":"Prabhu","family":"Vellaisamy","sequence":"additional","affiliation":[{"name":"Department of ECE, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7225-0629","authenticated-orcid":false,"given":"John Paul","family":"Shen","sequence":"additional","affiliation":[{"name":"Department of ECE, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9775-8026","authenticated-orcid":false,"given":"Di","family":"Wu","sequence":"additional","affiliation":[{"name":"Department of ECE, University of Central Florida, Orlando, Florida, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,3,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. USENIX Symposium on Operating Systems Design and Implementation","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. USENIX Symposium on Operating Systems Design and Implementation (2024)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ARITH.2019.00023"},{"key":"e_1_3_2_1_3_1","volume-title":"GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints. Empirical Methods in Natural Language Processing.","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebron, and Sumit Sanghai. 2023. GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints. Empirical Methods in Natural Language Processing."},{"key":"e_1_3_2_1_4_1","unstructured":"Shuai Bai Yuxuan Cai Ruizhe Chen Keqin Chen Xionghui Chen Zesen Cheng Lianghao Deng Wei Ding Chang Gao Chunjiang Ge Wenbin Ge Zhifang Guo Qidong Huang Jie Huang Fei Huang Binyuan Hui Shutong Jiang Zhaohai Li Mingsheng Li Mei Li Kaixin Li Zicheng Lin Junyang Lin Xuejing Liu Jiawei Liu Chenglong Liu Yang Liu Dayiheng Liu Shixuan Liu Dunjie Lu Ruilin Luo Chenxu Lv Rui Men Lingchen Meng Xuancheng Ren Xingzhang Ren Sibo Song Yuchong Sun Jun Tang Jianhong Tu Jianqiang Wan Peng Wang Pengfei Wang Qiuyue Wang Yuxuan Wang Tianbao Xie Yiheng Xu Haiyang Xu Jin Xu Zhibo Yang Mingkun Yang Jianxin Yang An Yang Bowen Yu Fei Zhang Hang Zhang Xi Zhang Bo Zheng Humen Zhong Jingren Zhou Fan Zhou Jing Zhou Yuanzhi Zhu and Ke Zhu. 2025. Qwen3-VL Technical Report. arXiv (2025)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3085572"},{"key":"e_1_3_2_1_6_1","unstructured":"Jeff Barr. 2019. Amazon EC2 Update \u2013 Inf1 Instances with AWS Inferentia Chips for High Performance Cost-Effective Inferencing. https:\/\/aws.amazon.com\/blogs\/aws\/amazon-ec2-update-inf1-instances-with-aws-inferentia-chips-for-high-performance-cost-effective-inferencing\/"},{"key":"e_1_3_2_1_7_1","volume-title":"Jones","author":"Brunvand Erik","year":"2018","unstructured":"Erik Brunvand, Donald Kline, and Alex K. Jones. 2018. Dark Silicon Considered Harmful: A Case for Truly Green Computing. In nternational Green and Sustainable Computing Conference (IGSC)."},{"key":"e_1_3_2_1_8_1","volume-title":"Towards Fine-Tunable Quantized Large Language Models with Error Correction through Low-Rank Adaptation. arXiv","author":"Chai Yuji","year":"2023","unstructured":"Yuji Chai, John Gkountouras, Glenn G. Ko, David Brooks, and Gu-Yeon Wei. 2023. INT2.1: Towards Fine-Tunable Quantized Large Language Models with Error Correction through Low-Rank Adaptation. arXiv (2023)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC55918.2022.00018"},{"key":"e_1_3_2_1_10_1","volume-title":"Lazy Batching: An SLA-aware Batching System for Cloud Machine Learning Inference. In International Symposium on High-Performance Computer Architecture.","author":"Choi Yujeong","year":"2021","unstructured":"Yujeong Choi, Yunseong Kim, and Minsoo Rhu. 2021. Lazy Batching: An SLA-aware Batching System for Cloud Machine Learning Inference. In International Symposium on High-Performance Computer Architecture."},{"key":"e_1_3_2_1_11_1","volume-title":"DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models. arXiv","author":"Dai Damai","year":"2024","unstructured":"Damai Dai, Chengqi Deng, Chenggang Zhao, R. X. Xu, Huazuo Gao, Deli Chen, Jiashi Li, Wangding Zeng, Xingkai Yu, Y. Wu, Zhenda Xie, Y. K. Li, Panpan Huang, Fuli Luo, Chong Ruan, Zhifang Sui, and Wenfeng Liang. 2024. DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models. arXiv (2024)."},{"key":"e_1_3_2_1_12_1","unstructured":"DeepSeek-AI Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu et al. 2025. Deepseek-V3 Technical Report. arXiv (2025)."},{"key":"e_1_3_2_1_13_1","unstructured":"Tim Dettmers Mike Lewis Younes Belkada and Luke Zettlemoyer. 2022. LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning. JMLR.org.","author":"Driess Danny","year":"2023","unstructured":"Danny Driess, Fei Xia, Mehdi S. M. Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, Wenlong Huang, Yevgen Chebotar, Pierre Sermanet, Daniel Duckworth, Sergey Levine, Vincent Vanhoucke, Karol Hausman, Marc Toussaint, Klaus Greff, Andy Zeng, Igor Mordatch, and Pete Florence. 2023. PaLM-E: an embodied multimodal language model. In Proceedings of the 40th International Conference on Machine Learning. JMLR.org."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 39th International Conference on Machine Learning.","author":"Du Nan","year":"2022","unstructured":"Nan Du, Yanping Huang, Andrew M Dai, Simon Tong, Dmitry Lepikhin, Yuanzhong Xu, Maxim Krikun, Yanqi Zhou, Adams Wei Yu, Orhan Firat, Barret Zoph, Liam Fedus, Maarten P Bosma, Zongwei Zhou, Tao Wang, Emma Wang, Kellie Webster, Marie Pellat, Kevin Robinson, Kathleen Meier-Hellstern, Toju Duke, Lucas Dixon, Kun Zhang, Quoc Le, Yonghui Wu, Zhifeng Chen, and Claire Cui. 2022. GLaM: Efficient Scaling of Language Models with Mixture-of-Experts. In Proceedings of the 39th International Conference on Machine Learning."},{"key":"e_1_3_2_1_16_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle et al. 2024. The Llama 3 Herd of Models. arXiv (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2017.12.012"},{"key":"e_1_3_2_1_18_1","volume-title":"LLMCarbon: Modeling the end-to-end Carbon Footprint of Large Language Models. arXiv","author":"Faiz Ahmad","year":"2024","unstructured":"Ahmad Faiz, Sotaro Kaneda, Ruhan Wang, Rita Osi, Prateek Sharma, Fan Chen, and Lei Jiang. 2024. LLMCarbon: Modeling the end-to-end Carbon Footprint of Large Language Models. arXiv (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"GPTQ: Accurate Post-Training Quantization for Generative Pre-Trained Transformers. arXiv","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2023. GPTQ: Accurate Post-Training Quantization for Generative Pre-Trained Transformers. arXiv (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSSC.1969.300225"},{"key":"e_1_3_2_1_21_1","volume-title":"Cambricon-U: A Systolic Random Increment Memory Architecture for Unary Computing. In International Symposium on Microarchitecture.","author":"Guo Hongrui","year":"2023","unstructured":"Hongrui Guo, Yongwei Zhao, Zhangmai Li, Yifan Hao, Chang Liu, Xinkai Song, Xiaqing Li, Zidong Du, Rui Zhang, Qi Guo, Tianshi Chen, and Zhiwei Xu. 2023. Cambricon-U: A Systolic Random Increment Memory Architecture for Unary Computing. In International Symposium on Microarchitecture."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527408"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Udit Gupta Mariam Elgamal Gage Hills Gu-Yeon Wei Hsien-Hsin S. Lee David Brooks and Carole-Jean Wu. 2022b. ACT: designing sustainable computer systems with an architectural carbon modeling tool.","DOI":"10.1145\/3470496.3527408"},{"key":"e_1_3_2_1_24_1","volume-title":"International Symposium on Computer Architecture.","author":"Ham Tae Jun","unstructured":"Tae Jun Ham, Yejin Lee, Seong Hoon Seo, Soosung Kim, Hyunji Choi, Sung Jun Jung, and Jae W. Lee. 2021. ELSA: hardware-software co-design for efficient, lightweight self-attention mechanism in neural networks. In International Symposium on Computer Architecture."},{"key":"e_1_3_2_1_25_1","volume-title":"Gaussian Error Linear Units (GELUs). arXiv","author":"Hendrycks Dan","year":"2016","unstructured":"Dan Hendrycks and Kevin Gimpel. 2016. Gaussian Error Linear Units (GELUs). arXiv (2016)."},{"key":"e_1_3_2_1_26_1","volume-title":"Kurt Keutzer, and Amir Gholami.","author":"Hooper Coleman","year":"2024","unstructured":"Coleman Hooper, Sehoon Kim, Hiva Mohammadzadeh, Michael W. Mahoney, Yakun Sophia Shao, Kurt Keutzer, and Amir Gholami. 2024. KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache Quantization. arXiv (2024)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"key":"e_1_3_2_1_28_1","volume-title":"BiLLM: Pushing the Limit of Post-Training Quantization for LLMs. arXiv","author":"Huang Wei","year":"2024","unstructured":"Wei Huang, Yangdong Liu, Haotong Qin, Ying Li, Shiming Zhang, Xianglong Liu, Michele Magno, and Xiaojuan Qi. 2024. BiLLM: Pushing the Limit of Post-Training Quantization for LLMs. arXiv (2024)."},{"key":"e_1_3_2_1_29_1","unstructured":"International Telecommunication Union (ITU) and World Benchmarking Alliance (WBA). 2025. Tech sector emissions energy use grow with rise of AI. https:\/\/www.itu.int\/en\/mediacentre\/Pages\/PR-2025-06-05-greening-digital-companies-report.aspx Accessed: 2025-08-12."},{"key":"e_1_3_2_1_30_1","volume-title":"FIGNA: Integer Unit-Based Accelerator Design for FP-INT GEMM Preserving Numerical Accuracy. In 2024 IEEE International Symposium on High-Performance Computer Architecture (HPCA).","author":"Jang Jaeyong","year":"2024","unstructured":"Jaeyong Jang, Yulhwa Kim, Juheun Lee, and Jae-Joon Kim. 2024. FIGNA: Integer Unit-Based Accelerator Design for FP-INT GEMM Preserving Numerical Accuracy. In 2024 IEEE International Symposium on High-Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_31_1","unstructured":"Yangqing Jia. 2014. Learning Semantic Image Representations at a Large Scale. Ph.D. Dissertation. EECS Department University of California Berkeley. http:\/\/www2.eecs.berkeley.edu\/Pubs\/TechRpts\/2014\/EECS-2014-93.html"},{"key":"e_1_3_2_1_32_1","unstructured":"Dhiraj Kalamkar Dheevatsa Mudigere Naveen Mellempudi Dipankar Das Kunal Banerjee Sasikanth Avancha Dharma Teja Vooturi Nataraj Jammalamadaka Jianyu Huang Hector Yuen Jiyan Yang Jongsoo Park Alexander Heinecke Evangelos Georganas Sudarshan Srinivasan Abhisek Kundu Misha Smelyanskiy Bharat Kaul and Pradeep Dubey. 2019. A Study of BFLOAT16 For Deep Learning Training. arXiv (2019)."},{"key":"e_1_3_2_1_33_1","volume-title":"GEAR: An Efficient KV Cache Compression Recipe for Near-Lossless Generative Inference of LLM. arXiv","author":"Qingru Zhang Kang Tushar","year":"2024","unstructured":"Tushar and Qingru Zhang Kang, Hao, Souvik Kundu, Geonhwa Jeong, Zaoxing Liu, Tushar Krishna, and Tuo Zhao. 2024. GEAR: An Efficient KV Cache Compression Recipe for Near-Lossless Generative Inference of LLM. arXiv (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"NonGEMM Bench: Understanding the Performance Horizon of the Latest ML Workloads with NonGEMM Workloads. In International Symposium on Performance Analysis of Systems and Software.","author":"Karami Rachid","year":"2025","unstructured":"Rachid Karami, Sheng-Chun Kao, and Hyoukjun Kwon. 2025. NonGEMM Bench: Understanding the Performance Horizon of the Latest ML Workloads with NonGEMM Workloads. In International Symposium on Performance Analysis of Systems and Software."},{"key":"e_1_3_2_1_35_1","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. ImageNet Classification with Deep Convolutional Neural Networks. In Advances in Neural Information Processing Systems F. Pereira C.J. Burges L. Bottou and K.Q. Weinberger (Eds.)."},{"key":"e_1_3_2_1_36_1","volume-title":"Yuwei Ren, Markus Nagel, Jorn Peters, and Tijmen Blankevoort.","author":"Kuzmin Andrey","year":"2022","unstructured":"Andrey Kuzmin, Mart Van Baalen, Yuwei Ren, Markus Nagel, Jorn Peters, and Tijmen Blankevoort. 2022. FP8 Quantization: The Power of the Exponent. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_37_1","volume-title":"GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. arXiv","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2020. GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. arXiv (2020)."},{"key":"e_1_3_2_1_38_1","volume-title":"AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration. GetMobile: Mobile Comp. and Comm.","author":"Lin Ji","year":"2025","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Guangxuan Xiao, and Song Han. 2025. AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration. GetMobile: Mobile Comp. and Comm. (2025)."},{"key":"e_1_3_2_1_39_1","volume-title":"The Era of 1-bit LLMs: All Large Language Models are in 1.58 Bits. arXiv","author":"Ma Shuming","year":"2024","unstructured":"Shuming Ma, Hongyu Wang, Lingxiao Ma, Lei Wang, Wenhui Wang, Shaohan Huang, Li Dong, Ruiping Wang, Jilong Xue, and Furu Wei. 2024. The Era of 1-bit LLMs: All Large Language Models are in 1.58 Bits. arXiv (2024)."},{"key":"e_1_3_2_1_40_1","unstructured":"Meta AI. 2025. Llama 4: Multimodal Intelligence. https:\/\/ai.meta.com\/blog\/llama-4-multimodal-intelligence\/. Accessed: 2025-12-05."},{"key":"e_1_3_2_1_41_1","volume-title":"FP8 Formats for Deep Learning. arXiv","author":"Micikevicius Paulius","year":"2022","unstructured":"Paulius Micikevicius, Dusan Stosic, Neil Burgess, Marius Cornea, Pradeep Dubey, Richard Grisenthwaite, Sangwon Ha, Alexander Heinecke, Patrick Judd, John Kamalu, Naveen Mellempudi, Stuart Oberman, Mohammad Shoeybi, Michael Siu, and Hao Wu. 2022. FP8 Formats for Deep Learning. arXiv (2022)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/NORCHIP.2014.7004740"},{"key":"e_1_3_2_1_43_1","first-page":"11","volume":"202","author":"Tensor NVIDIA.","unstructured":"NVIDIA. 2024a. NVIDIA H100 Tensor Core GPU Architecture. https:\/\/resources.nvidia.com\/en-us-hopper-architecture\/nvidia-h100-tensor-c Retrieved 2024-11-14 from","journal-title":"Retrieved"},{"key":"e_1_3_2_1_44_1","first-page":"11","volume":"202","author":"NVIDIA.","unstructured":"NVIDIA. 2024b. TensorRT-LLM. https:\/\/github.com\/NVIDIA\/TensorRT-LLM Retrieved 2024-11-14 from","journal-title":"LLM Retrieved"},{"key":"e_1_3_2_1_45_1","volume":"199","author":"Oberman Stuart F.","unstructured":"Stuart F. Oberman and Michael J. Flynn. 1997. Division Algorithms and Implementations. IEEEXplore (1997).","journal-title":"Michael J. Flynn."},{"key":"e_1_3_2_1_46_1","volume-title":"Carat: Unlocking Value-Level Parallelism for Multiplier-Free GEMMs. In International Conference on Architectural Support for Programming Languages and Operating Systems.","author":"Pan Zhewen","year":"2024","unstructured":"Zhewen Pan, Joshua San Miguel, and Di Wu. 2024. Carat: Unlocking Value-Level Parallelism for Multiplier-Free GEMMs. In International Conference on Architectural Support for Programming Languages and Operating Systems."},{"key":"e_1_3_2_1_47_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreass K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. arXiv (2019)."},{"key":"e_1_3_2_1_48_1","volume-title":"Carbon Emissions and Large Neural Network Training. arXiv","author":"Patterson David","year":"2021","unstructured":"David Patterson, Joseph Gonzalez, Quoc Le, Chen Liang, Lluis-Miquel Munguia, Daniel Rothchild, David So, Maud Texier, and Jeff Dean. 2021. Carbon Emissions and Large Neural Network Training. arXiv (2021)."},{"key":"e_1_3_2_1_49_1","unstructured":"Raspberry Pi. 2024. Raspberry Pi 5. https:\/\/www.raspberrypi.com\/products\/raspberry-pi-5\/ Retrieved 2024-11-14 from"},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of Machine Learning and Systems.","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2023. Efficiently Scaling Transformer Inference. In Proceedings of Machine Learning and Systems."},{"key":"e_1_3_2_1_51_1","volume-title":"Representation Range Needs for 16-Bit Neural Network Training. arXiv","author":"Popescu Valentina","year":"2021","unstructured":"Valentina Popescu, Abhinav Venigalla, Di Wu, and Robert Schreiber. 2021. Representation Range Needs for 16-Bit Neural Network Training. arXiv (2021)."},{"key":"e_1_3_2_1_52_1","volume-title":"FACT: FFN-Attention Co-optimized Transformer Architecture with Eager Correlation Prediction. In International Symposium on Computer Architecture.","author":"Qin Yubin","year":"2023","unstructured":"Yubin Qin, Yang Wang, Dazheng Deng, Zhiren Zhao, Xiaolong Yang, Leibo Liu, Shaojun Wei, Yang Hu, and Shouyi Yin. 2023. FACT: FFN-Attention Co-optimized Transformer Architecture with Eager Correlation Prediction. In International Symposium on Computer Architecture."},{"key":"e_1_3_2_1_53_1","volume-title":"MECLA: Memory-Compute-Efficient LLM Accelerator with Scaling Sub-matrix Partition. In International Symposium on Computer Architecture.","author":"Qin Yubin","year":"2025","unstructured":"Yubin Qin, Yang Wang, Zhiren Zhao, Xiaolong Yang, Yang Zhou, Shaojun Wei, Yang Hu, and Shouyi Yin. 2025. MECLA: Memory-Compute-Efficient LLM Accelerator with Scaling Sub-matrix Partition. In International Symposium on Computer Architecture."},{"key":"e_1_3_2_1_54_1","volume-title":"International Conference on Machine Learning.","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust Speech Recognition via Large-Scale Weak Supervision. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Mariam Rakka Jinhao Li Guohao Dai Ahmed Eltawil Mohammed E Fouda and Fadi Kurdahi. 2025. SoftmAP: Software-Hardware Co-design for Integer-Only Softmax on Associative Processors. In Design Automation and Test in Europe.","DOI":"10.23919\/DATE64628.2025.10992862"},{"key":"e_1_3_2_1_56_1","volume-title":"Le","author":"Ramachandran Prajit","year":"2017","unstructured":"Prajit Ramachandran, Barret Zoph, and Quoc V. Le. 2017. Searching for Activation Functions. arXiv (2017)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00016"},{"key":"e_1_3_2_1_58_1","volume-title":"Simba: Scaling Deep-Learning Inference with Multi-Chip-Module-Based Architecture. In International Symposium on Microarchitecture. 14\u201327","author":"Shao Yakun Sophia","unstructured":"Yakun Sophia Shao, Jason Clemons, Rangharajan Venkatesan, Brian Zimmer, Matthew Fojtik, Nan Jiang, Ben Keller, Alicia Klinefelter, Nathaniel Pinckney, Priyanka Raina, Stephen G. Tell, Yanqing Zhang, William J. Dally, Joel Emer, C. Thomas Gray, Brucek Khailany, and Stephen W. Keckler. 2019. Simba: Scaling Deep-Learning Inference with Multi-Chip-Module-Based Architecture. In International Symposium on Microarchitecture. 14\u201327."},{"key":"e_1_3_2_1_59_1","volume-title":"Nexus: A GPU Cluster Engine for Accelerating DNN-Based Video Analysis. In Symposium on Operating Systems Principles.","author":"Shen Haichen","year":"2019","unstructured":"Haichen Shen, Lequn Chen, Yuchen Jin, Liangyu Zhao, Bingyu Kong, Matthai Philipose, Arvind Krishnamurthy, and Ravi Sundaram. 2019. Nexus: A GPU Cluster Engine for Accelerating DNN-Based Video Analysis. In Symposium on Operating Systems Principles."},{"key":"e_1_3_2_1_60_1","volume-title":"International Conference on Machine Learning.","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586134"},{"key":"e_1_3_2_1_62_1","volume-title":"Gemini: A Family of Highly Capable Multimodal Models.","author":"Team Gemini","year":"2025","unstructured":"Gemini Team, Rohan Anil, Sebastian Borgeaud, Jean-Baptiste Alayrac, Jiahui Yu, Radu Soricut, et al., 2025. Gemini: A Family of Highly Capable Multimodal Models."},{"key":"e_1_3_2_1_63_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswamim Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. LLAMA 2: Open Foundation and Fine-Tuned Chat Models. arXiv (2023)."},{"key":"e_1_3_2_1_64_1","volume-title":"\u0141 ukasz Kaiser, and Illia Polosukhin","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All You Need. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_65_1","volume-title":"MCBP: A Memory-Compute Efficient LLM Inference Accelerator Leveraging Bit-Slice-enabled Sparsity and Repetitiveness. In International Symposium on Microarchitecture.","author":"Wang Huizheng","year":"2025","unstructured":"Huizheng Wang, Zichuan Wang, Zhiheng Yue, Yousheng Long, Taiquan Wei, Jianxun Yang, Yang Wang, Chao Li, Shaojun Wei, Yang Hu, and Shouyi Yin. 2025. MCBP: A Memory-Compute Efficient LLM Inference Accelerator Leveraging Bit-Slice-enabled Sparsity and Repetitiveness. In International Symposium on Microarchitecture."},{"key":"e_1_3_2_1_66_1","volume-title":"SpAtten: Efficient Sparse Attention Architecture with Cascade Token and Head Pruning. International Symposium on High-Performance Computer Architecture","author":"Wang Hanrui","year":"2021","unstructured":"Hanrui Wang, Zhekai Zhang, and Song Han. 2021. SpAtten: Efficient Sparse Attention Architecture with Cascade Token and Head Pruning. International Symposium on High-Performance Computer Architecture (2021)."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3174243.3174253"},{"key":"e_1_3_2_1_68_1","volume-title":"ResearchGate","author":"Wielgosz Maciej","year":"2009","unstructured":"Maciej Wielgosz and Ernest Jamro. 2009. Highly Efficient Twin Module Structure of 64-Bit Exponential Function Implemented on SGI RASC Platform., ResearchGate (2009)."},{"key":"e_1_3_2_1_69_1","volume-title":"HuggingFace's Transformers: State-of-the-art Natural Language Processing. arXiv","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, and Jamie Brew. 2020. HuggingFace's Transformers: State-of-the-art Natural Language Processing. arXiv (2020)."},{"key":"e_1_3_2_1_70_1","unstructured":"Carole-Jean Wu Ramya Raghavendra Udit Gupta Bilge Acun Newsha Ardalani Kiwan Maeng Gloria Chang Fiona Aga Behram James Huang Charles Bai Michael Gschwind Anurag Gupta Myle Ott Anastasia Melnikov Salvatore Candido David Brooks Geeta Chauhan Benjamin Lee Hsien-Hsin S. Lee Bugra Akyildiz Maximilian Balandat Joe Spisak Ravi Jain Mike Rabbat and Kim Hazelwood. 2022b. Sustainable AI: Environmental Implications Challenges and Opportunities. arXiv (2022)."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISLPED.2019.8824959"},{"key":"e_1_3_2_1_72_1","volume-title":"UNO: Virtualizing and Unifying Nonlinear Operations for Emerging Neural Networks. In International Symposium on Low Power Electronics and Design.","author":"Wu Di","year":"2021","unstructured":"Di Wu, Jingjie Li, Setareh Behrooz, Younghyun Kim, and Joshua San Miguel. 2021a. UNO: Virtualizing and Unifying Nonlinear Operations for Emerging Neural Networks. In International Symposium on Low Power Electronics and Design."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527401"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00040"},{"key":"e_1_3_2_1_75_1","volume-title":"uGEMM: Unary Computing for GEMM Applications","author":"Wu Di","year":"2021","unstructured":"Di Wu, Jingjie Li, Ruokai Yin, Hsuan Hsiao, Younghyun Kim, and Joshua San Miguel. 2021b. uGEMM: Unary Computing for GEMM Applications. IEEE Micro (2021)."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3316781.3317844"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD53106.2021.00014"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00010"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/MDAT.2021.3050716"},{"key":"e_1_3_2_1_80_1","volume-title":"Normalized Stability: A Cross-Level Design Metric for Early Termination in Stochastic Computing. In Asia and South Pacific Design Automation Conference.","author":"Wu Di","year":"2021","unstructured":"Di Wu, Ruokai Yin, and Joshua San Miguel. 2021 d. Normalized Stability: A Cross-Level Design Metric for Early Termination in Stochastic Computing. In Asia and South Pacific Design Automation Conference."},{"key":"e_1_3_2_1_81_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In USENIX Symposium on Operating Systems Design and Implementation.","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In USENIX Symposium on Operating Systems Design and Implementation."},{"key":"e_1_3_2_1_82_1","volume-title":"KV Cache is 1 Bit Per Channel: Efficient Large Language Model Inference with Coupled Quantization. arXiv","author":"Zhang Tianyi","year":"2024","unstructured":"Tianyi Zhang, Jonah Yi, Zhaozhuo Xu, and Anshumali Shrivastava. 2024. KV Cache is 1 Bit Per Channel: Efficient Large Language Model Inference with Coupled Quantization. arXiv (2024)."},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00077"}],"event":{"name":"ASPLOS '26: 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Pittsburgh PA USA","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"deposited":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T14:06:12Z","timestamp":1773583572000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3779212.3790189"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,22]]},"references-count":83,"alternative-id":["10.1145\/3779212.3790189","10.1145\/3779212"],"URL":"https:\/\/doi.org\/10.1145\/3779212.3790189","relation":{},"subject":[],"published":{"date-parts":[[2026,3,22]]},"assertion":[{"value":"2026-03-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}