{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T17:01:59Z","timestamp":1773939719927,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T00:00:00Z","timestamp":1740700800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,2,28]]},"DOI":"10.1145\/3710848.3710871","type":"proceedings-article","created":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T06:20:57Z","timestamp":1740723657000},"page":"239-251","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-8073-8845","authenticated-orcid":false,"given":"Elias","family":"Frantar","sequence":"first","affiliation":[{"name":"ISTA, Klosterneuburg, Austria"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5493-0287","authenticated-orcid":false,"given":"Roberto L.","family":"Castro","sequence":"additional","affiliation":[{"name":"CITIC, Universidade da Coru\u00f1a, A Coru\u00f1a, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5337-5875","authenticated-orcid":false,"given":"Jiale","family":"Chen","sequence":"additional","affiliation":[{"name":"ISTA, Klosterneuburg, Austria"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1333-9797","authenticated-orcid":false,"given":"Torsten","family":"Hoefler","sequence":"additional","affiliation":[{"name":"ETH Z\u00fcrich, Z\u00fcrich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3650-940X","authenticated-orcid":false,"given":"Dan","family":"Alistarh","sequence":"additional","affiliation":[{"name":"ISTA, Klosterneuburg, Austria, Neural Magic, Inc., Somerville, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,2,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Quarot: Outlier-free 4-bit inference in rotated llms. arXiv preprint arXiv:2404.00456","author":"Ashkboos Saleh","year":"2024","unstructured":"Saleh Ashkboos, Amirkeivan Mohtashami, Maximilian L Croci, Bo Li, Martin Jaggi, Dan Alistarh, Torsten Hoefler, and James Hensman. 2024. Quarot: Outlier-free 4-bit inference in rotated llms. arXiv preprint arXiv:2404.00456 (2024)."},{"key":"e_1_3_2_1_2_1","volume-title":"Faster and Lighter LLMs: A Survey on Current Challenges and Way Forward. arXiv preprint arXiv:2402.01799","author":"Chavan Arnav","year":"2024","unstructured":"Arnav Chavan, Raghav Magazine, Shubham Kushwaha, M\u00e9rouane Debbah, and Deepak Gupta. 2024. Faster and Lighter LLMs: A Survey on Current Challenges and Way Forward. arXiv preprint arXiv:2402.01799 (2024)."},{"key":"e_1_3_2_1_3_1","unstructured":"Jerry Chee Yaohui Cai Volodymyr Kuleshov and Christopher De Sa. 2023. QuIP: 2-Bit Quantization of Large Language Models With Guarantees. arXiv:2307.13304 [cs.LG]"},{"key":"e_1_3_2_1_4_1","volume-title":"Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale. Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022 (2022)."},{"key":"e_1_3_2_1_5_1","volume-title":"SpQR: A Sparse-Quantized Representation for Near-Lossless LLM Weight Compression. arXiv preprint arXiv:2306.03078","author":"Dettmers Tim","year":"2023","unstructured":"Tim Dettmers, Ruslan Svirschevski, Vage Egiazarian, Denis Kuznedelev, Elias Frantar, Saleh Ashkboos, Alexander Borzunov, Torsten Hoefler, and Dan Alistarh. 2023. SpQR: A Sparse-Quantized Representation for Near-Lossless LLM Weight Compression. arXiv preprint arXiv:2306.03078 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"The case for 4-bit precision: k-bit Inference Scaling Laws. arXiv preprint arXiv:2212.09720","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers and Luke Zettlemoyer. 2022. The case for 4-bit precision: k-bit Inference Scaling Laws. arXiv preprint arXiv:2212.09720 (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"Extreme compression of large language models via additive quantization. arXiv preprint arXiv:2401.06118","author":"Egiazarian Vage","year":"2024","unstructured":"Vage Egiazarian, Andrei Panferov, Denis Kuznedelev, Elias Frantar, Artem Babenko, and Dan Alistarh. 2024. Extreme compression of large language models via additive quantization. arXiv preprint arXiv:2401.06118 (2024)."},{"key":"e_1_3_2_1_8_1","unstructured":"ExLlamaV2. 2024. Exllamav2: A memory efficient fork of HF Transformers optimized for LLaMA models. https:\/\/github.com\/turboderp\/exllamav2. Accessed: 2024-08-15."},{"key":"e_1_3_2_1_9_1","volume-title":"SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot. In International Conference on Machine Learning (ICML).","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar and Dan Alistarh. 2023. SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_10_1","volume-title":"GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_11_1","unstructured":"Andrew Griffin. 2024. ChatGPT creators OpenAI are generating 100 billion words per day CEO says. https:\/\/www.independent.co.uk\/tech\/chatgpt-openai-words-sam-altman-b2494900.html"},{"key":"e_1_3_2_1_12_1","unstructured":"Mark Harris et al. 2007. Optimizing parallel reduction in CUDA. Nvidia developer technology (2007)."},{"key":"e_1_3_2_1_13_1","volume-title":"Who Says Elephants Can't Run: Bringing Large Scale MoE Models into Cloud Scale Production. arXiv preprint arXiv:2211.10017","author":"Kim Young Jin","year":"2022","unstructured":"Young Jin Kim, Rawn Henry, Raffy Fahim, and Hany Hassan Awadalla. 2022. Who Says Elephants Can't Run: Bringing Large Scale MoE Models into Cloud Scale Production. arXiv preprint arXiv:2211.10017 (2022)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_15_1","volume-title":"AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. arXiv preprint arXiv:2306.00978","author":"Lin Ji","year":"2023","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Xingyu Dang, and Song Han. 2023. AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. arXiv preprint arXiv:2306.00978 (2023)."},{"key":"e_1_3_2_1_16_1","unstructured":"NVIDIA. 2020. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf."},{"key":"e_1_3_2_1_17_1","unstructured":"NVIDIA. 2022. NVIDIA A10 Datasheet. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/a10\/pdf\/datasheet-new\/nvidia-a10-datasheet.pdf."},{"key":"e_1_3_2_1_18_1","unstructured":"NVIDIA. 2022. NVIDIA Instruction Set. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html#warp-level-matrix-instructions-for-sparse-mma."},{"key":"e_1_3_2_1_19_1","unstructured":"NVIDIA. 2024. CUTLASS Convolution. https:\/\/github.com\/NVIDIA\/cutlass\/blob\/main\/media\/docs\/implicit_gemm_convolution.md."},{"key":"e_1_3_2_1_20_1","unstructured":"NVIDIA. 2024. Efficient GEMM in CUDA. https:\/\/github.com\/NVIDIA\/cutlass\/blob\/main\/media\/docs\/efficient_gemm.md."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577479"},{"key":"e_1_3_2_1_22_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_23_1","volume-title":"Language models are unsupervised multitask learners. OpenAI blog 1, 8","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. 2019. Language models are unsupervised multitask learners. OpenAI blog 1, 8 (2019), 9."},{"key":"e_1_3_2_1_24_1","unstructured":"Wenqi Shao Mengzhao Chen Zhaoyang Zhang Peng Xu Lirui Zhao Zhiqian Li Kaipeng Zhang Peng Gao Yu Qiao and Ping Luo. 2023. OmniQuant: Omnidirectionally Calibrated Quantization for Large Language Models. arXiv:2308.13137 [cs.LG]"},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 31094--31116","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. 2023. Flexgen: High-throughput generative inference of large language models with a single gpu. In International Conference on Machine Learning. PMLR, 31094--31116."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3217824"},{"key":"e_1_3_2_1_27_1","unstructured":"TII UAE. 2023. The Falcon Family of Large Language Models. https:\/\/huggingface.co\/tiiuae."},{"key":"e_1_3_2_1_28_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_29_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models. arXiv preprint arXiv:2211.10438","author":"Xiao Guangxuan","year":"2022","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Julien Demouth, and Song Han. 2022. SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models. arXiv preprint arXiv:2211.10438 (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"Xi Victoria Lin, et al","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. 2022. OPT: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."},{"key":"e_1_3_2_1_32_1","volume-title":"QQQ: Quality Quattuor-Bit Quantization for Large Language Models. arXiv:2406.09904 [cs.LG]","author":"Zhang Ying","year":"2024","unstructured":"Ying Zhang, Peng Zhang, Mincong Huang, Jingyang Xiang, Yujie Wang, Chao Wang, Yineng Zhang, Lei Yu, Chuan Liu, and Wei Lin. 2024. QQQ: Quality Quattuor-Bit Quantization for Large Language Models. arXiv:2406.09904 [cs.LG]"}],"event":{"name":"PPoPP '25: The 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Las Vegas NV USA","acronym":"PPoPP '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710871","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3710848.3710871","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:16:01Z","timestamp":1755875761000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710871"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,28]]},"references-count":32,"alternative-id":["10.1145\/3710848.3710871","10.1145\/3710848"],"URL":"https:\/\/doi.org\/10.1145\/3710848.3710871","relation":{},"subject":[],"published":{"date-parts":[[2025,2,28]]},"assertion":[{"value":"2025-02-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}