{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,15]],"date-time":"2026-01-15T22:07:22Z","timestamp":1768514842855,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":140,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,24]],"date-time":"2024-08-24T00:00:00Z","timestamp":1724457600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,25]]},"DOI":"10.1145\/3637528.3671465","type":"proceedings-article","created":{"date-parts":[[2024,8,25]],"date-time":"2024-08-25T04:54:55Z","timestamp":1724561695000},"page":"6605-6615","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Inference Optimization of Foundation Models on AI Accelerators"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0970-9214","authenticated-orcid":false,"given":"Youngsuk","family":"Park","sequence":"first","affiliation":[{"name":"AWS AI, Santa Clara, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5255-8642","authenticated-orcid":false,"given":"Kailash","family":"Budhathoki","sequence":"additional","affiliation":[{"name":"AWS AI, T\u00fcbingen, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2175-7016","authenticated-orcid":false,"given":"Liangfu","family":"Chen","sequence":"additional","affiliation":[{"name":"AWS AI, Santa Clara, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8634-7990","authenticated-orcid":false,"given":"Jonas M.","family":"K\u00fcbler","sequence":"additional","affiliation":[{"name":"AWS AI, T\u00fcbingen, 
Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4033-4795","authenticated-orcid":false,"given":"Jiaji","family":"Huang","sequence":"additional","affiliation":[{"name":"AWS AI, Santa Clara, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9907-4610","authenticated-orcid":false,"given":"Matth\u00e4us","family":"Kleindessner","sequence":"additional","affiliation":[{"name":"AWS AI, T\u00fcbingen, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6440-4732","authenticated-orcid":false,"given":"Jun","family":"Huan","sequence":"additional","affiliation":[{"name":"AWS AI, Santa Clara, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5004-201X","authenticated-orcid":false,"given":"Volkan","family":"Cevher","sequence":"additional","affiliation":[{"name":"AWS AI &amp; EPFL, T\u00fcbingen, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8165-840X","authenticated-orcid":false,"given":"Yida","family":"Wang","sequence":"additional","affiliation":[{"name":"AWS AI, Santa Clara, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2753-1437","authenticated-orcid":false,"given":"George","family":"Karypis","sequence":"additional","affiliation":[{"name":"AWS AI, Santa Clara, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,8,24]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. arXiv preprint arXiv:2403.02310","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. 
Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. arXiv preprint arXiv:2403.02310 (2024)."},{"key":"e_1_3_2_1_2_1","volume-title":"Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 2023. Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245 (2023)."},{"key":"e_1_3_2_1_3_1","unstructured":"Ebtesam Almazrouei Hamza Alobeidli Abdulaziz Alshamsi Alessandro Cappelli Ruxandra Cojocaru M\u00e9rouane Debbah \u00c9tienne Goffinet Daniel Hesslow Julien Launay Quentin Malartic et al. 2023. The falcon series of open language models. arXiv preprint arXiv:2311.16867 (2023)."},{"key":"e_1_3_2_1_4_1","unstructured":"AMD. 2024. AMD matrix cores. https:\/\/rocm.blogs.amd.com\/software-tools-optimization\/matrix-cores\/README.html"},{"key":"e_1_3_2_1_5_1","volume-title":"Inference with Reference: Lossless Acceleration of Large Language Models. arXiv preprint arXiv:2304.04487","author":"Yang N.","year":"2023","unstructured":"N. Yang, T. Ge, L. Wang, B. Jiao, D. Jiang, L. Yang, R. Majumder, and F. Wei. 2023. Inference with Reference: Lossless Acceleration of Large Language Models. arXiv preprint arXiv:2304.04487 (2023)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_7_1","unstructured":"Quentin Anthony Yury Tokpanov Paolo Glorioso and Beren Millidge. 2024. BlackMamba: Mixture of Experts for State-Space Models. arxiv: 2402.01771 [cs.CL]"},{"key":"e_1_3_2_1_8_1","unstructured":"Anthropic. 2024. Introducing the Next Generation of Claude. https:\/\/www.anthropic.com\/news\/claude-3-family"},{"key":"e_1_3_2_1_9_1","volume-title":"The Twelfth International Conference on Learning Representations. 
https:\/\/openreview.net\/forum?id=vXxardq6db","author":"Ashkboos Saleh","year":"2024","unstructured":"Saleh Ashkboos, Maximilian L. Croci, Marcelo Gennari do Nascimento, Torsten Hoefler, and James Hensman. 2024. SliceGPT: Compress Large Language Models by Deleting Rows and Columns. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=vXxardq6db"},{"key":"e_1_3_2_1_10_1","unstructured":"AWS-Neuron. 2024. https:\/\/awsdocs-neuron.readthedocs-hosted.com\/en\/latest\/general\/arch\/neuron-features\/collective-communication.html."},{"key":"e_1_3_2_1_11_1","unstructured":"AWS-Neuron. 2024. https:\/\/github.com\/aws-neuron\/transformers-neuronx."},{"key":"e_1_3_2_1_12_1","volume-title":"Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150","author":"Beltagy Iz","year":"2020","unstructured":"Iz Beltagy, Matthew E Peters, and Arman Cohan. 2020. Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150 (2020)."},{"key":"e_1_3_2_1_13_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3655592"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Cristian Bucila Rich Caruana and Alexandru Niculescu-Mizil. 2006. Model compression.. In KDD. ACM 535--541.","DOI":"10.1145\/1150402.1150464"},{"key":"e_1_3_2_1_16_1","volume-title":"Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads. arXiv preprint arXiv:2401.10774","author":"Cai T.","year":"2024","unstructured":"T. Cai, Y. Li, Z. Geng, H. Peng, J. D. Lee, D. Chen, and T. Dao. 2024. Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads. 
arXiv preprint arXiv:2401.10774 (2024)."},{"key":"e_1_3_2_1_17_1","unstructured":"Harrison Chase. 2022. LangChain. https:\/\/github.com\/langchain-ai\/langchain"},{"key":"e_1_3_2_1_18_1","unstructured":"Charlie Chen Sebastian Borgeaud Geoffrey Irving Jean-Baptiste Lespiau Laurent Sifre and John Jumper. 2023. Accelerating Large Language Model Decoding with Speculative Sampling. arxiv: 2302.01318"},{"key":"e_1_3_2_1_19_1","volume-title":"Accelerating large language model decoding with speculative sampling. arXiv preprint 2302.01318","author":"Chen Charlie","year":"2023","unstructured":"Charlie Chen, Sebastian Borgeaud, Geoffrey Irving, Jean-Baptiste Lespiau1, Laurent Sifre1, and John Jumper. 2023. Accelerating large language model decoding with speculative sampling. arXiv preprint 2302.01318 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW).","author":"Chen Y.","unstructured":"Y. Chen, R. Sarokin, J. Lee, J. Tang, C. Chang, A. Kulik, and M. Grundmann. 2023. Speed Is All You Need: On-Device Acceleration of Large Diffusion Models via GPU-Aware Optimizations. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)."},{"key":"e_1_3_2_1_21_1","volume-title":"Sequoia: Scalable, Robust, and Hardware-aware Speculative Decoding. arxiv: 2402.12374 [cs.CL]","author":"Chen Z.","year":"2024","unstructured":"Z. Chen, A. May, R. Svirschevski, Y. Huang, M. Ryabinin, Z. Jia, and B. Chen. 2024. Sequoia: Scalable, Robust, and Hardware-aware Speculative Decoding. arxiv: 2402.12374 [cs.CL]"},{"key":"e_1_3_2_1_22_1","volume-title":"Somani","author":"Chitty-Venkata Krishna Teja","year":"2023","unstructured":"Krishna Teja Chitty-Venkata, Sparsh Mittal, Murali Emani, Venkatram Vishwanath, and Arun K. Somani. 2023. A Survey of Techniques for Optimizing Transformer Inference. 
arxiv: 2307.07982 [cs.LG]"},{"key":"e_1_3_2_1_23_1","volume-title":"Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. 2023. Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691 (2023)."},{"key":"e_1_3_2_1_24_1","first-page":"16344","article-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. Advances in Neural Information Processing Systems, Vol. 35 (2022), 16344--16359.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_25_1","unstructured":"Tim Dettmers Mike Lewis Younes Belkada and Luke Zettlemoyer. 2022. LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale. arxiv: 2208.07339 [cs.LG]"},{"key":"e_1_3_2_1_26_1","unstructured":"Tim Dettmers Artidoro Pagnoni Ari Holtzman and Luke Zettlemoyer. 2023. QLoRA: Efficient Finetuning of Quantized LLMs. arxiv: 2305.14314 [cs.LG]"},{"key":"e_1_3_2_1_27_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv: 1810.04805 [cs.CL]"},{"key":"e_1_3_2_1_28_1","unstructured":"Vage Egiazarian Andrei Panferov Denis Kuznedelev Elias Frantar Artem Babenko and Dan Alistarh. 2024. Extreme Compression of Large Language Models via Additive Quantization. 
arxiv: 2401.06118 [cs.LG]"},{"key":"e_1_3_2_1_29_1","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research, Vol. 23, 120 (2022), 1--39.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.37"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3144376"},{"key":"e_1_3_2_1_32_1","volume-title":"International Conference on Machine Learning. PMLR, 10323--10337","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar and Dan Alistarh. 2023. Sparsegpt: Massive language models can be accurately pruned in one-shot. In International Conference on Machine Learning. PMLR, 10323--10337."},{"key":"e_1_3_2_1_33_1","volume-title":"GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. arxiv: 2210.17323 [cs.LG]","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2023. GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. arxiv: 2210.17323 [cs.LG]"},{"key":"e_1_3_2_1_34_1","volume-title":"Advances in Neural Information Processing Systems","volume":"9","author":"Fritsch J\u00fcrgen","year":"1996","unstructured":"J\u00fcrgen Fritsch, Michael Finke, and Alex Waibel. 1996. Adaptively growing hierarchical mixtures of experts. Advances in Neural Information Processing Systems, Vol. 9 (1996)."},{"key":"e_1_3_2_1_35_1","volume-title":"Break the sequential dependency of llm inference using lookahead decoding. 
arXiv preprint arXiv:2402.02057","author":"Fu Yichao","year":"2024","unstructured":"Yichao Fu, Peter Bailis, Ion Stoica, and Hao Zhang. 2024. Break the sequential dependency of llm inference using lookahead decoding. arXiv preprint arXiv:2402.02057 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Zhong-Yi Lu, and Ji-Rong Wen.","author":"Gao Ze-Feng","year":"2022","unstructured":"Ze-Feng Gao, Peiyu Liu, Wayne Xin Zhao, Zhong-Yi Lu, and Ji-Rong Wen. 2022. Parameter-efficient mixture-of-experts architecture for pre-trained language models. arXiv preprint arXiv:2203.01104 (2022)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Amir Gholami Sehoon Kim Zhen Dong Zhewei Yao Michael W. Mahoney and Kurt Keutzer. 2021. A Survey of Quantization Methods for Efficient Neural Network Inference. arxiv: 2103.13630 [cs.CV]","DOI":"10.1201\/9781003162810-13"},{"key":"e_1_3_2_1_38_1","volume-title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces. arxiv: 2312.00752 [cs.LG]","author":"Gu Albert","year":"2024","unstructured":"Albert Gu and Tri Dao. 2024. Mamba: Linear-Time Sequence Modeling with Selective State Spaces. arxiv: 2312.00752 [cs.LG]"},{"key":"e_1_3_2_1_39_1","volume-title":"EFFICIENTLY MODELING LONG SEQUENCES WITH STRUCTURED STATE SPACES. In International Conference on Learning Representations.","author":"Gu Albert","year":"2022","unstructured":"Albert Gu, Karan Goel, and Christopher R\u00e9. 2022. EFFICIENTLY MODELING LONG SEQUENCES WITH STRUCTURED STATE SPACES. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_40_1","unstructured":"Guidance-AI. 2023. A guidance language for controlling large language models. https:\/\/github.com\/guidance-ai\/guidance. https:\/\/github.com\/guidance-ai\/guidance"},{"key":"e_1_3_2_1_42_1","volume-title":"PTQD: Accurate Post-Training Quantization for Diffusion Models. 
In Advances in Neural Information Processing Systems.","author":"He Yefei","year":"2023","unstructured":"Yefei He, Luping Liu, Jing Liu, Weijia Wu, Hong Zhou, and Bohan Zhuang. 2023. PTQD: Accurate Post-Training Quantization for Diffusion Models. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_43_1","volume-title":"Rest: Retrieval-based speculative decoding. arXiv preprint arXiv:2311.08252","author":"He Zhenyu","year":"2023","unstructured":"Zhenyu He, Zexuan Zhong, Tianle Cai, Jason D Lee, and Di He. 2023. Rest: Retrieval-based speculative decoding. arXiv preprint arXiv:2311.08252 (2023)."},{"key":"e_1_3_2_1_44_1","unstructured":"Geoffrey Hinton Oriol Vinyals and Jeff Dean. 2015. Distilling the Knowledge in a Neural Network. arxiv: 1503.02531 [stat.ML]"},{"key":"e_1_3_2_1_45_1","volume-title":"Kurt Keutzer, and Amir Gholami.","author":"Hooper Coleman","year":"2024","unstructured":"Coleman Hooper, Sehoon Kim, Hiva Mohammadzadeh, Michael W Mahoney, Yakun Sophia Shao, Kurt Keutzer, and Amir Gholami. 2024. KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache Quantization. arXiv preprint arXiv:2401.18079 (2024)."},{"key":"e_1_3_2_1_46_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_47_1","unstructured":"HuggingFace. 2023. https:\/\/github.com\/huggingface\/text-generation-inference."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Benoit Jacob Skirmantas Kligys Bo Chen Menglong Zhu Matthew Tang Andrew Howard Hartwig Adam and Dmitry Kalenichenko. 2017. Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference. 
arxiv: 1712.05877 [cs.LG]","DOI":"10.1109\/CVPR.2018.00286"},{"key":"e_1_3_2_1_49_1","unstructured":"Albert Q. Jiang Alexandre Sablayrolles Arthur Mensch Chris Bamford Devendra Singh Chaplot Diego de las Casas Florian Bressand Gianna Lengyel Guillaume Lample Lucile Saulnier L\u00e9lio Renard Lavaud Marie-Anne Lachaux Pierre Stock Teven Le Scao Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William El Sayed. 2023. Mistral 7B. arxiv: 2310.06825 [cs.CL]"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3154484"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_53_1","volume-title":"Dspy: Compiling declarative language model calls into self-improving pipelines. arXiv preprint arXiv:2310.03714","author":"Khattab Omar","year":"2023","unstructured":"Omar Khattab, Arnav Singhvi, Paridhi Maheshwari, Zhiyuan Zhang, Keshav Santhanam, Sri Vardhamanan, Saiful Haq, Ashutosh Sharma, Thomas T Joshi, Hanna Moazam, et al. 2023. Dspy: Compiling declarative language model calls into self-improving pipelines. arXiv preprint arXiv:2310.03714 (2023)."},{"key":"e_1_3_2_1_54_1","unstructured":"S. Kim K. Mangalam S. Moon J. Malik M. Mahoney A. Gholami and K. Keutzer. 2024. Speculative decoding with big little decoder. Advances in Neural Information Processing Systems (2024)."},{"key":"e_1_3_2_1_55_1","volume-title":"Basil Mustafa, Joshua Ainslie, Yi Tay, Mostafa Dehghani, and Neil Houlsby.","author":"Komatsuzaki Aran","year":"2022","unstructured":"Aran Komatsuzaki, Joan Puigcerver, James Lee-Thorp, Carlos Riquelme Ruiz, Basil Mustafa, Joshua Ainslie, Yi Tay, Mostafa Dehghani, and Neil Houlsby. 2022. Sparse upcycling: Training mixture-of-experts from dense checkpoints. 
arXiv preprint arXiv:2212.05055 (2022)."},{"key":"e_1_3_2_1_56_1","volume-title":"Levine (Eds.)","volume":"36","author":"Kurti\u00e7 Eldar","year":"2023","unstructured":"Eldar Kurti\u00e7, Elias Frantar, and Dan Alistarh. 2023. ZipLM: Inference-Aware Structured Pruning of Language Models. In Advances in Neural Information Processing Systems, A. Oh, T. Neumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 65597--65617. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/ced46a50befedcb884ccf0cbe8c3ad23-Paper-Conference.pdf"},{"key":"e_1_3_2_1_57_1","volume-title":"Jin-Hwa Kim, Baeseong Park, Byeongwook Kim, Jung-Woo Ha, Nako Sung, and Dongsoo Lee.","author":"Kwon Se Jung","year":"2022","unstructured":"Se Jung Kwon, Jeonghoon Kim, Jeongin Bae, Kang Min Yoo, Jin-Hwa Kim, Baeseong Park, Byeongwook Kim, Jung-Woo Ha, Nako Sung, and Dongsoo Lee. 2022. AlphaTuning: Quantization-Aware Parameter-Efficient Adaptation of Large-Scale Pre-Trained Language Models. arxiv: 2210.03858 [cs.LG]"},{"key":"e_1_3_2_1_58_1","unstructured":"Woosuk Kwon. 2023. Implement PagedAttention V2 by WoosukKwon \u00b7 Pull Request #1348 \u00b7 vllm-project\/vllm. https:\/\/github.com\/vllm-project\/vllm\/pull\/1348"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_60_1","volume-title":"Block pruning for faster transformers. arXiv preprint arXiv:2109.04838","author":"Lagunas Fran\u00e7ois","year":"2021","unstructured":"Fran\u00e7ois Lagunas, Ella Charlaix, Victor Sanh, and Alexander M Rush. 2021. Block pruning for faster transformers. arXiv preprint arXiv:2109.04838 (2021)."},{"key":"e_1_3_2_1_61_1","volume-title":"Dynamic Sparse Training with Structured Sparsity. In The Twelfth International Conference on Learning Representations. 
https:\/\/openreview.net\/forum?id=kOBkxFRKTA","author":"Lasby Mike","year":"2024","unstructured":"Mike Lasby, Anna Golubeva, Utku Evci, Mihai Nica, and Yani Ioannou. 2024. Dynamic Sparse Training with Structured Sparsity. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=kOBkxFRKTA"},{"key":"e_1_3_2_1_62_1","volume-title":"GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=qrwe7XHTmYb","author":"Lepikhin Dmitry","year":"2021","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2021. GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=qrwe7XHTmYb"},{"key":"e_1_3_2_1_63_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. 2023. Fast inference from transformers via speculative decoding. In International Conference on Machine Learning. PMLR, 19274--19286."},{"key":"e_1_3_2_1_64_1","volume-title":"International Conference on Machine Learning. PMLR, 6265--6274","author":"Lewis Mike","year":"2021","unstructured":"Mike Lewis, Shruti Bhosale, Tim Dettmers, Naman Goyal, and Luke Zettlemoyer. 2021. Base layers: Simplifying training of large, sparse models. In International Conference on Machine Learning. PMLR, 6265--6274."},{"key":"e_1_3_2_1_65_1","volume-title":"BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. arxiv","author":"Lewis Mike","year":"2019","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer. 
2019. BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. arxiv: 1910.13461 [cs.CL]"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3370748.3406567"},{"key":"e_1_3_2_1_67_1","unstructured":"Muyang Li Ji Lin Chenlin Meng Stefano Ermon Song Han and Jun-Yan Zhu. 2023. Efficient Spatially Sparse Inference for Conditional GANs and Diffusion Models. arxiv: 2211.02048 [cs.CV]"},{"key":"e_1_3_2_1_68_1","volume-title":"Sequence Parallelism: Long Sequence Training from System Perspective. arxiv: 2105.13120","author":"Li Shenggui","year":"2022","unstructured":"Shenggui Li, Fuzhao Xue, Chaitanya Baranwal, Yongbin Li, and Yang You. 2022. Sequence Parallelism: Long Sequence Training from System Perspective. arxiv: 2105.13120"},{"key":"e_1_3_2_1_69_1","unstructured":"Xiuyu Li Yijiang Liu Long Lian Huanrui Yang Zhen Dong Daniel Kang Shanghang Zhang and Kurt Keutzer. 2023. Q-Diffusion: Quantizing Diffusion Models. arxiv: 2302.04304 [cs.CV]"},{"key":"e_1_3_2_1_70_1","volume-title":"EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty. arXiv preprint arXiv:2401.15077","author":"Li Y.","year":"2024","unstructured":"Y. Li, F. Wei, C. Zhang, and H. Zhang. 2024. EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty. arXiv preprint arXiv:2401.15077 (2024)."},{"key":"e_1_3_2_1_71_1","unstructured":"Chen Liang Simiao Zuo Qingru Zhang Pengcheng He Weizhu Chen and Tuo Zhao. 2023. Less is More: Task-aware Layer-wise Distillation for Language Model Compression. arxiv: 2210.01351 [cs.CL]"},{"key":"e_1_3_2_1_72_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Liu Hao","year":"2024","unstructured":"Hao Liu and Pieter Abbeel. 2024. Blockwise Parallel Transformers for Large Context Models. Advances in Neural Information Processing Systems, Vol. 
36 (2024)."},{"key":"e_1_3_2_1_73_1","volume-title":"Ring attention with blockwise transformers for near-infinite context. arXiv preprint arXiv:2310.01889","author":"Liu Hao","year":"2023","unstructured":"Hao Liu, Matei Zaharia, and Pieter Abbeel. 2023. Ring attention with blockwise transformers for near-infinite context. arXiv preprint arXiv:2310.01889 (2023)."},{"key":"e_1_3_2_1_74_1","volume-title":"QLLM: Accurate and Efficient Low-Bitwidth Quantization for Large Language Models. arxiv: 2310.08041 [cs.CL]","author":"Liu Jing","year":"2024","unstructured":"Jing Liu, Ruihao Gong, Xiuying Wei, Zhiwei Dong, Jianfei Cai, and Bohan Zhuang. 2024. QLLM: Accurate and Efficient Low-Bitwidth Quantization for Large Language Models. arxiv: 2310.08041 [cs.CL]"},{"key":"e_1_3_2_1_75_1","unstructured":"Zechun Liu Barlas Oguz Changsheng Zhao Ernie Chang Pierre Stock Yashar Mehdad Yangyang Shi Raghuraman Krishnamoorthi and Vikas Chandra. 2023. LLM-QAT: Data-Free Quantization Aware Training for Large Language Models. arxiv: 2305.17888 [cs.CL]"},{"key":"e_1_3_2_1_76_1","unstructured":"Raphael Gontijo Lopes Stefano Fenu and Thad Starner. 2017. Data-Free Knowledge Distillation for Deep Neural Networks. arxiv: 1710.07535 [cs.LG]"},{"key":"e_1_3_2_1_77_1","unstructured":"Shuming Ma Hongyu Wang Lingxiao Ma Lei Wang Wenhui Wang Shaohan Huang Li Dong Ruiping Wang Jilong Xue and Furu Wei. 2024. The Era of 1-bit LLMs: All Large Language Models are in 1.58 Bits. arxiv: 2402.17764 [cs.CL]"},{"key":"e_1_3_2_1_78_1","volume-title":"Llm-pruner: On the structural pruning of large language models. Advances in neural information processing systems","author":"Ma Xinyin","year":"2023","unstructured":"Xinyin Ma, Gongfan Fang, and Xinchao Wang. 2023. Llm-pruner: On the structural pruning of large language models. Advances in neural information processing systems, Vol. 
36 (2023), 21702--21720."},{"key":"e_1_3_2_1_79_1","unstructured":"Vladimir Malinovskii Denis Mazur Ivan Ilin Denis Kuznedelev Konstantin Burlachenko Kai Yi Dan Alistarh and Peter Richtarik. 2024. PV-Tuning: Beyond Straight-Through Estimation for Extreme LLM Compression. arxiv: 2405.14852 [cs.LG]"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"crossref","unstructured":"Chenlin Meng Robin Rombach Ruiqi Gao Diederik P. Kingma Stefano Ermon Jonathan Ho and Tim Salimans. 2023. On Distillation of Guided Diffusion Models. arxiv: 2210.03142 [cs.CV]","DOI":"10.1109\/CVPR52729.2023.01374"},{"key":"e_1_3_2_1_81_1","volume-title":"Specinfer: Accelerating generative llm serving with speculative inference and token tree verification. arXiv preprint arXiv:2305.09781","author":"Miao X.","year":"2023","unstructured":"X. Miao, G. Oliaro, Z. Zhang, X. Cheng, Z. Wang, R. Y. Wong, Z. Chen, D. Arfeen, R. Abhyankar, and Z. Jia. 2023. Specinfer: Accelerating generative llm serving with speculative inference and token tree verification. arXiv preprint arXiv:2305.09781 (2023)."},{"key":"e_1_3_2_1_82_1","unstructured":"Paulius Micikevicius Dusan Stosic Neil Burgess Marius Cornea Pradeep Dubey Richard Grisenthwaite Sangwon Ha Alexander Heinecke Patrick Judd John Kamalu et al. 2022. Fp8 formats for deep learning. arXiv preprint arXiv:2209.05433 (2022)."},{"key":"e_1_3_2_1_83_1","unstructured":"G. Monea A. Joulin and E. Grave. 2023. PaSS: Parallel Speculative Sampling. arXiv preprint arXiv:2311.13581 (2023)."},{"key":"e_1_3_2_1_84_1","unstructured":"NVIDIA. 2022. https:\/\/github.com\/NVIDIA\/FasterTransformer."},{"key":"e_1_3_2_1_85_1","unstructured":"NVIDIA. 2024. https:\/\/github.com\/NVIDIA\/TensorRT-LLM."},{"key":"e_1_3_2_1_86_1","unstructured":"NVIDIA. 2024. NVIDIA Tensor Cores. https:\/\/www.nvidia.com\/en-us\/data-center\/tensor-cores\/"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"crossref","unstructured":"Charith Peris Lizhen Tan Thomas Gueudre Turan Gojayev Pan Wei and Gokmen Oz. 2022. 
Knowledge Distillation Transfer Sets and their Impact on Downstream NLU Tasks. arxiv: 2210.04834 [cs.CL]","DOI":"10.18653\/v1\/2022.emnlp-industry.12"},{"key":"e_1_3_2_1_88_1","unstructured":"Maciej Pi\u00f3ro Kamil Ciebiera Krystian Kr\u00f3l Jan Ludziejewski Micha\u0142 Krutul Jakub Krajewski Szymon Antoniak Piotr Mi\u0142o\u015b Marek Cygan and Sebastian Jaszczur. 2024. MoE-Mamba: Efficient Selective State Space Models with Mixture of Experts. arxiv: 2401.04081 [cs.LG]"},{"key":"e_1_3_2_1_89_1","unstructured":"Jeff Pool Abhishek Sawarkar and Jay Rodge. 2021. Accelerating Inference with Sparsity Using the NVIDIA Ampere Architecture and NVIDIA TensorRT. https:\/\/developer.nvidia.com\/blog\/accelerating-inference-with-sparsity-using-ampere-and-tensorrt\/"},{"key":"e_1_3_2_1_90_1","volume-title":"Proceedings of Machine Learning and Systems","volume":"5","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2023. Efficiently scaling transformer inference. Proceedings of Machine Learning and Systems, Vol. 5 (2023)."},{"key":"e_1_3_2_1_91_1","unstructured":"Princeton-NLP. 2023. https:\/\/princeton-nlp.github.io\/flash-decoding\/."},{"key":"e_1_3_2_1_92_1","volume-title":"International conference on machine learning. PMLR","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale. In International conference on machine learning. PMLR, 18332--18346."},{"key":"e_1_3_2_1_93_1","volume-title":"Peter Conway Humphreys, and Adam Santoro","author":"Raposo David","year":"2024","unstructured":"David Raposo, Sam Ritter, Blake Richards, Timothy Lillicrap, Peter Conway Humphreys, and Adam Santoro. 2024. 
Mixture-of-Depths: Dynamically allocating compute in transformer-based language models. arxiv: 2404.02258 [cs.LG]"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_95_1","unstructured":"Tim Salimans and Jonathan Ho. 2022. Progressive Distillation for Fast Sampling of Diffusion Models. arxiv: 2202.00512 [cs.LG]"},{"key":"e_1_3_2_1_96_1","volume-title":"DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arxiv","author":"Sanh Victor","year":"2020","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2020. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arxiv: 1910.01108 [cs.CL]"},{"key":"e_1_3_2_1_97_1","volume-title":"Movement pruning: Adaptive sparsity by fine-tuning. Advances in neural information processing systems","author":"Sanh Victor","year":"2020","unstructured":"Victor Sanh, Thomas Wolf, and Alexander Rush. 2020. Movement pruning: Adaptive sparsity by fine-tuning. Advances in neural information processing systems, Vol. 33 (2020), 20378--20389."},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"crossref","unstructured":"A. Santilli S. Severino E. Postolache V. Maiorca M. Mancusi R. Marin and E. Rodol\u00e0. 2023. Accelerating transformer inference for translation via parallel decoding. arXiv preprint arXiv:2305.10427 (2023).","DOI":"10.18653\/v1\/2023.acl-long.689"},{"key":"e_1_3_2_1_99_1","unstructured":"Tal Schuster Adam Fisch Jai Gupta Mostafa Dehghani Dara Bahri Vinh Q. Tran Yi Tay and Donald Metzler. 2022. Confident Adaptive Language Modeling. arxiv: 2207.07061 [cs.CL]"},{"key":"e_1_3_2_1_100_1","doi-asserted-by":"crossref","unstructured":"Yuzhang Shang Zhihang Yuan Bin Xie Bingzhe Wu and Yan Yan. 2023. Post-training Quantization on Diffusion Models. 
arxiv: 2211.15736 [cs.CV]","DOI":"10.1109\/CVPR52729.2023.00196"},{"key":"e_1_3_2_1_101_1","volume-title":"One-Shot Sensitivity-Aware Mixed Sparsity Pruning for Large Language Models. arXiv preprint arXiv:2310.09499","author":"Shao Hang","year":"2023","unstructured":"Hang Shao, Bei Liu, Bo Xiao, Ke Zeng, Guanglu Wan, and Yanmin Qian. 2023. One-Shot Sensitivity-Aware Mixed Sparsity Pruning for Large Language Models. arXiv preprint arXiv:2310.09499 (2023)."},{"key":"e_1_3_2_1_102_1","unstructured":"Wenqi Shao Mengzhao Chen Zhaoyang Zhang Peng Xu Lirui Zhao Zhiqian Li Kaipeng Zhang Peng Gao Yu Qiao and Ping Luo. 2024. OmniQuant: Omnidirectionally Calibrated Quantization for Large Language Models. arxiv: 2308.13137 [cs.LG]"},{"key":"e_1_3_2_1_103_1","volume-title":"Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150","author":"Shazeer Noam","year":"2019","unstructured":"Noam Shazeer. 2019. Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150 (2019)."},{"key":"e_1_3_2_1_104_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)."},{"key":"e_1_3_2_1_105_1","volume-title":"InInternational Conference on Machine Learning","author":"Song Y.","year":"2021","unstructured":"Y. Song, C. Meng, R. Liao R, and S. Ermo. 2021. Accelerating feedforward computation via parallel nonlinear equation solving. InInternational Conference on Machine Learning (2021)."},{"key":"e_1_3_2_1_106_1","volume-title":"A simple and effective pruning approach for large language models. 
arXiv preprint arXiv:2306.11695","author":"Sun Mingjie","year":"2023","unstructured":"Mingjie Sun, Zhuang Liu, Anna Bair, and J Zico Kolter. 2023. A simple and effective pruning approach for large language models. arXiv preprint arXiv:2306.11695 (2023)."},{"key":"e_1_3_2_1_107_1","volume-title":"Patient Knowledge Distillation for BERT Model Compression. arxiv","author":"Sun Siqi","year":"1908","unstructured":"Siqi Sun, Yu Cheng, Zhe Gan, and Jingjing Liu. 2019. Patient Knowledge Distillation for BERT Model Compression. arxiv: 1908.09355 [cs.CL]"},{"key":"e_1_3_2_1_108_1","volume-title":"Structured Pruning for Efficient Generative Pre-trained Language Models. In Findings of the Association for Computational Linguistics: ACL","author":"Tao Chaofan","year":"2023","unstructured":"Chaofan Tao, Lu Hou, Haoli Bai, Jiansheng Wei, Xin Jiang, Qun Liu, Ping Luo, and Ngai Wong. 2023. Structured Pruning for Efficient Generative Pre-trained Language Models. In Findings of the Association for Computational Linguistics: ACL 2023."},{"key":"e_1_3_2_1_109_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert 
Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arxiv: 2307.09288 [cs.CL]"},{"key":"e_1_3_2_1_110_1","unstructured":"Albert Tseng Jerry Chee Qingyao Sun Volodymyr Kuleshov and Christopher De Sa. 2024. QuIP#: Even Better LLM Quantization with Hadamard Incoherence and Lattice Codebooks. arxiv: 2402.04396 [cs.LG]"},{"key":"e_1_3_2_1_111_1","unstructured":"Mart van Baalen Andrey Kuzmin Suparna S Nair Yuwei Ren Eric Mahurin Chirag Patel Sundar Subramanian Sanghyuk Lee Markus Nagel Joseph Soriaga and Tijmen Blankevoort. 2023. FP8 versus INT8 for efficient deep learning inference. arxiv: 2303.17951 [cs.LG]"},{"key":"e_1_3_2_1_112_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_113_1","unstructured":"vLLM Contributors. 2024. https:\/\/github.com\/vllm-project\/vllm\/."},{"key":"e_1_3_2_1_114_1","volume-title":"Towards accurate data-free quantization for diffusion models. arXiv preprint arXiv:2305.18723","author":"Wang Changyuan","year":"2023","unstructured":"Changyuan Wang, Ziwei Wang, Xiuwei Xu, Yansong Tang, Jie Zhou, and Jiwen Lu. 2023. Towards accurate data-free quantization for diffusion models. arXiv preprint arXiv:2305.18723 (2023)."},{"key":"e_1_3_2_1_115_1","unstructured":"Hongyu Wang Shuming Ma Li Dong Shaohan Huang Huaijie Wang Lingxiao Ma Fan Yang Ruiping Wang Yi Wu and Furu Wei. 2023. BitNet: Scaling 1-bit Transformers for Large Language Models. arxiv: 2310.11453 [cs.CL]"},{"key":"e_1_3_2_1_116_1","volume-title":"Outlier Suppression: Accurate quantization of large language models by equivalent and optimal shifting and scaling. 
arxiv: 2304.09145 [cs.CL]","author":"Wei Xiuying","year":"2023","unstructured":"Xiuying Wei, Yunchen Zhang, Yuhang Li, Xiangguo Zhang, Ruihao Gong, Jinyang Guo, and Xianglong Liu. 2023. Outlier Suppression: Accurate quantization of large language models by equivalent and optimal shifting and scaling. arxiv: 2304.09145 [cs.CL]"},{"key":"e_1_3_2_1_117_1","volume-title":"Ximing Lu, Sean Welleck, and Yejin Choi.","author":"West Peter","year":"2022","unstructured":"Peter West, Chandra Bhagavatula, Jack Hessel, Jena D. Hwang, Liwei Jiang, Ronan Le Bras, Ximing Lu, Sean Welleck, and Yejin Choi. 2022. Symbolic Knowledge Distillation: from General Language Models to Commonsense Models. arxiv: 2110.07178 [cs.CL]"},{"key":"e_1_3_2_1_118_1","unstructured":"xai. 2024. Open Release of Grok-1. https:\/\/x.ai\/blog\/grok-os"},{"key":"e_1_3_2_1_119_1","volume-title":"Flash-llm: Enabling cost-effective and highly-efficient large generative model inference with unstructured sparsity. arXiv preprint arXiv:2309.10285","author":"Xia Haojun","year":"2023","unstructured":"Haojun Xia, Zhen Zheng, Yuchao Li, Donglin Zhuang, Zhongzhu Zhou, Xiafei Qiu, Yong Li, Wei Lin, and Shuaiwen Leon Song. 2023. Flash-llm: Enabling cost-effective and highly-efficient large generative model inference with unstructured sparsity. arXiv preprint arXiv:2309.10285 (2023)."},{"key":"e_1_3_2_1_120_1","volume-title":"The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=09iOdaeOzp","author":"Xia Mengzhou","year":"2024","unstructured":"Mengzhou Xia, Tianyu Gao, Zhiyuan Zeng, and Danqi Chen. 2024. Sheared LLaMA: Accelerating Language Model Pre-training via Structured Pruning. In The Twelfth International Conference on Learning Representations. 
https:\/\/openreview.net\/forum?id=09iOdaeOzp"},{"key":"e_1_3_2_1_121_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.107"},{"key":"e_1_3_2_1_122_1","unstructured":"Guangxuan Xiao Ji Lin Mickael Seznec Hao Wu Julien Demouth and Song Han. 2023. SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models. arxiv: 2211.10438 [cs.CL]"},{"key":"e_1_3_2_1_123_1","volume-title":"Efficient streaming language models with attention sinks. arXiv preprint arXiv:2309.17453","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Yuandong Tian, Beidi Chen, Song Han, and Mike Lewis. 2023. Efficient streaming language models with attention sinks. arXiv preprint arXiv:2309.17453 (2023)."},{"key":"e_1_3_2_1_124_1","volume-title":"BESA: Pruning Large Language Models with Blockwise Parameter-Efficient Sparsity Allocation. arXiv preprint arXiv:2402.16880","author":"Xu Peng","year":"2024","unstructured":"Peng Xu, Wenqi Shao, Mengzhao Chen, Shitao Tang, Kaipeng Zhang, Peng Gao, Fengwei An, Yu Qiao, and Ping Luo. 2024. BESA: Pruning Large Language Models with Blockwise Parameter-Efficient Sparsity Allocation. arXiv preprint arXiv:2402.16880 (2024)."},{"key":"e_1_3_2_1_125_1","unstructured":"Zhao Yang. 2023. Support FP8-E5M2 KV Cache by zhaoyang-star \u00b7 Pull Request #2279 \u00b7 vllm-project\/vllm. https:\/\/github.com\/vllm-project\/vllm\/pull\/2279"},{"key":"e_1_3_2_1_126_1","unstructured":"Zhewei Yao Xiaoxia Wu Cheng Li Stephen Youn and Yuxiong He. 2023. ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation. arxiv: 2303.08302 [cs.LG]"},{"key":"e_1_3_2_1_127_1","volume-title":"Outlier weighed layerwise sparsity (owl): A missing secret sauce for pruning llms to high sparsity. 
arXiv preprint arXiv:2310.05175","author":"Yin Lu","year":"2023","unstructured":"Lu Yin, You Wu, Zhenyu Zhang, Cheng-Yu Hsieh, Yaqing Wang, Yiling Jia, Mykola Pechenizkiy, Yi Liang, Zhangyang Wang, and Shiwei Liu. 2023. Outlier weighed layerwise sparsity (owl): A missing secret sauce for pruning llms to high sparsity. arXiv preprint arXiv:2310.05175 (2023)."},{"key":"e_1_3_2_1_128_1","volume-title":"Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, et al.","author":"Zaheer Manzil","year":"2020","unstructured":"Manzil Zaheer, Guru Guruganesh, Kumar Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, et al. 2020. Big bird: Transformers for longer sequences. Advances in neural information processing systems, Vol. 33 (2020), 17283--17297."},{"key":"e_1_3_2_1_129_1","doi-asserted-by":"crossref","unstructured":"J. Zhang J. Wang H. Li L. Shou K. Chen G. Chen and S. Mehrotra. 2023. Draft & verify: Lossless large language model acceleration via self-speculative decoding. arXiv preprint arXiv:2309.08168 (2023).","DOI":"10.18653\/v1\/2024.acl-long.607"},{"key":"e_1_3_2_1_130_1","volume-title":"LoRAPrune: Pruning Meets Low-Rank Parameter-Efficient Fine-Tuning. arXiv preprint arXiv:2305.18403","author":"Zhang Mingyang","year":"2023","unstructured":"Mingyang Zhang, Hao Chen, Chunhua Shen, Zhen Yang, Linlin Ou, Xinyi Yu, and Bohan Zhuang. 2023. LoRAPrune: Pruning Meets Low-Rank Parameter-Efficient Fine-Tuning. arXiv preprint arXiv:2305.18403 (2023)."},{"key":"e_1_3_2_1_131_1","volume-title":"PLATON: Pruning Large Transformer Models with Upper Confidence Bound of Weight Importance. arXiv preprint arXiv:2206.12562","author":"Zhang Qingru","year":"2022","unstructured":"Qingru Zhang, Simiao Zuo, Chen Liang, Alexander Bukharin, Pengcheng He, Weizhu Chen, and Tuo Zhao. 2022. PLATON: Pruning Large Transformer Models with Upper Confidence Bound of Weight Importance. 
arXiv preprint arXiv:2206.12562 (2022)."},{"key":"e_1_3_2_1_132_1","volume-title":"Plug-and-Play: An Efficient Post-training Pruning Method for Large Language Models. In The Twelfth International Conference on Learning Representations.","author":"Zhang Yingtao","year":"2024","unstructured":"Yingtao Zhang, Haoli Bai, Haokun Lin, Jialin Zhao, Lu Hou, and Carlo Vittorio Cannistraci. 2024. Plug-and-Play: An Efficient Post-training Pruning Method for Large Language Models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_133_1","volume-title":"Moefication: Transformer feed-forward layers are mixtures of experts. arXiv preprint arXiv:2110.01786","author":"Zhang Zhengyan","year":"2021","unstructured":"Zhengyan Zhang, Yankai Lin, Zhiyuan Liu, Peng Li, Maosong Sun, and Jie Zhou. 2021. Moefication: Transformer feed-forward layers are mixtures of experts. arXiv preprint arXiv:2110.01786 (2021)."},{"key":"e_1_3_2_1_134_1","unstructured":"Zhenyu Zhang Ying Sheng Tianyi Zhou Tianlong Chen Lianmin Zheng Ruisi Cai Zhao Song Yuandong Tian Christopher R\u00e9 Clark Barrett Zhangyang Wang and Beidi Chen. 2023. H$_2$O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models. arxiv: 2306.14048"},{"key":"e_1_3_2_1_135_1","volume-title":"Optimizing memory-access patterns for deep learning accelerators. arXiv preprint arXiv:2002.12798","author":"Zheng Hongbin","year":"2020","unstructured":"Hongbin Zheng, Sejong Oh, Huiqing Wang, Preston Briggs, Jiading Gai, Animesh Jain, Yizhi Liu, Rich Heaton, Randy Huang, and Yida Wang. 2020. Optimizing memory-access patterns for deep learning accelerators. 
arXiv preprint arXiv:2002.12798 (2020)."},{"key":"e_1_3_2_1_136_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al.","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Jeff Huang, Chuyue Sun, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. 2023. Efficiently programming large language models using sglang. arXiv preprint arXiv:2312.07104 (2023)."},{"key":"e_1_3_2_1_137_1","volume-title":"DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. arXiv preprint arXiv:2401.09670","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. arXiv preprint arXiv:2401.09670 (2024)."},{"key":"e_1_3_2_1_138_1","first-page":"7103","article-title":"Mixture-of-experts with expert choice routing","volume":"35","author":"Zhou Yanqi","year":"2022","unstructured":"Yanqi Zhou, Tao Lei, Hanxiao Liu, Nan Du, Yanping Huang, Vincent Zhao, Andrew M Dai, Quoc V Le, James Laudon, et al. 2022. Mixture-of-experts with expert choice routing. Advances in Neural Information Processing Systems, Vol. 35 (2022), 7103--7114.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_139_1","volume-title":"Distillspec: Improving speculative decoding via knowledge distillation. arXiv preprint arXiv:2310.08461","author":"Zhou Y.","year":"2023","unstructured":"Y. Zhou, K. Lyu, A. S. Rawat, A. K. Menon, A. Rostamizadeh, S. Kumar, J. F. Kagy, and R. Agarwal. 2023. Distillspec: Improving speculative decoding via knowledge distillation. arXiv preprint arXiv:2310.08461 (2023)."},{"key":"e_1_3_2_1_140_1","volume-title":"To prune, or not to prune: exploring the efficacy of pruning for model compression. 
arXiv preprint arXiv:1710.01878","author":"Zhu Michael","year":"2017","unstructured":"Michael Zhu and Suyog Gupta. 2017. To prune, or not to prune: exploring the efficacy of pruning for model compression. arXiv preprint arXiv:1710.01878 (2017)."},{"key":"e_1_3_2_1_141_1","volume-title":"St-moe: Designing stable and transferable sparse expert models. arXiv preprint arXiv:2202.08906","author":"Zoph Barret","year":"2022","unstructured":"Barret Zoph, Irwan Bello, Sameer Kumar, Nan Du, Yanping Huang, Jeff Dean, Noam Shazeer, and William Fedus. 2022. St-moe: Designing stable and transferable sparse expert models. arXiv preprint arXiv:2202.08906 (2022)."}],"event":{"name":"KDD '24: The 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Barcelona Spain","acronym":"KDD '24","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data 
Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671465","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3637528.3671465","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:26Z","timestamp":1750291406000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671465"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,24]]},"references-count":140,"alternative-id":["10.1145\/3637528.3671465","10.1145\/3637528"],"URL":"https:\/\/doi.org\/10.1145\/3637528.3671465","relation":{},"subject":[],"published":{"date-parts":[[2024,8,24]]},"assertion":[{"value":"2024-08-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}