{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T03:44:23Z","timestamp":1768967063152,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3721146.3721961","type":"proceedings-article","created":{"date-parts":[[2025,4,1]],"date-time":"2025-04-01T17:42:05Z","timestamp":1743529325000},"page":"56-65","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["FlexInfer: Breaking Memory Constraint via Flexible and Efficient Offloading for On-Device LLM Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7995-1834","authenticated-orcid":false,"given":"Hongchao","family":"Du","sequence":"first","affiliation":[{"name":"City University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1961-143X","authenticated-orcid":false,"given":"Shangyu","family":"Wu","sequence":"additional","affiliation":[{"name":"City University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7620-8567","authenticated-orcid":false,"given":"Arina","family":"Kharlamova","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of Artificial Intelligence, Abu Dhabi, United Arab Emirates"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3775-911X","authenticated-orcid":false,"given":"Nan","family":"Guan","sequence":"additional","affiliation":[{"name":"City University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6431-9868","authenticated-orcid":false,"given":"Chun Jason","family":"Xue","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of Artificial Intelligence, Abu Dhabi, United Arab Emirates"}]}],"member":"320","published-online":{"date-parts":[[2025,4]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints. arXiv:2305.13245 [cs.CL] https:\/\/arxiv.org\/abs\/2305.13245","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 2023. GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints. arXiv:2305.13245 [cs.CL] https:\/\/arxiv.org\/abs\/2305.13245"},{"key":"e_1_3_2_1_2_1","volume-title":"Mohammad Rastegari, and Mehrdad Farajtabar.","author":"Alizadeh Keivan","year":"2024","unstructured":"Keivan Alizadeh, Iman Mirzadeh, Dmitry Belenko, Karen Khatamifard, Minsik Cho, Carlo C Del Mundo, Mohammad Rastegari, and Mehrdad Farajtabar. 2024. LLM in a flash: Efficient Large Language Model Inference with Limited Memory. arXiv:2312.11514 [cs.CL] https:\/\/arxiv.org\/abs\/2312.11514"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_4_1","unstructured":"Lihu Chen and Ga\u00ebl Varoquaux. 2024. What is the Role of Small Models in the LLM Era: A Survey. arXiv:2409.06857 [cs.CL] https:\/\/arxiv.org\/abs\/2409.06857"},{"key":"e_1_3_2_1_5_1","volume-title":"Oh (Eds.)","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In Advances in Neural Information Processing Systems, S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh (Eds.), Vol. 35. Curran Associates, Inc., 16344--16359. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/67d57c32e20fd0a7a302cb81d36e40d5-Paper-Conference.pdf"},{"key":"e_1_3_2_1_6_1","unstructured":"DeepSeek-AI Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi Xiaokang Zhang Xingkai Yu Yu Wu Z. F. Wu Zhibin Gou Zhihong Shao Zhuoshu Li Ziyi Gao Aixin Liu Bing Xue Bingxuan Wang Bochao Wu Bei Feng Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan Damai Dai Deli Chen Dongjie Ji Erhang Li Fangyun Lin Fucong Dai Fuli Luo Guangbo Hao Guanting Chen Guowei Li H. Zhang Han Bao Hanwei Xu Haocheng Wang Honghui Ding Huajian Xin Huazuo Gao Hui Qu Hui Li Jianzhong Guo Jiashi Li Jiawei Wang Jingchang Chen Jingyang Yuan Junjie Qiu Junlong Li J. L. Cai Jiaqi Ni Jian Liang Jin Chen Kai Dong Kai Hu Kaige Gao Kang Guan Kexin Huang Kuai Yu Lean Wang Lecong Zhang Liang Zhao Litong Wang Liyue Zhang Lei Xu Leyi Xia Mingchuan Zhang Minghua Zhang Minghui Tang Meng Li Miaojun Wang Mingming Li Ning Tian Panpan Huang Peng Zhang Qiancheng Wang Qinyu Chen Qiushi Du Ruiqi Ge Ruisong Zhang Ruizhe Pan Runji Wang R. J. Chen R. L. Jin Ruyi Chen Shanghao Lu Shangyan Zhou Shanhuang Chen Shengfeng Ye Shiyu Wang Shuiping Yu Shunfeng Zhou Shuting Pan S. S. Li Shuang Zhou Shaoqing Wu Shengfeng Ye Tao Yun Tian Pei Tianyu Sun T. Wang Wangding Zeng Wanjia Zhao Wen Liu Wenfeng Liang Wenjun Gao Wenqin Yu Wentao Zhang W. L. Xiao Wei An Xiaodong Liu Xiaohan Wang Xiaokang Chen Xiaotao Nie Xin Cheng Xin Liu Xin Xie Xingchao Liu Xinyu Yang Xinyuan Li Xuecheng Su Xuheng Lin X. Q. Li Xiangyue Jin Xiaojin Shen Xiaosha Chen Xiaowen Sun Xiaoxiang Wang Xinnan Song Xinyi Zhou Xianzu Wang Xinxia Shan Y. K. Li Y. Q. Wang Y. X. Wei Yang Zhang Yanhong Xu Yao Li Yao Zhao Yaofeng Sun Yaohui Wang Yi Yu Yichao Zhang Yifan Shi Yiliang Xiong Ying He Yishi Piao Yisong Wang Yixuan Tan Yiyang Ma Yiyuan Liu Yongqiang Guo Yuan Ou Yuduan Wang Yue Gong Yuheng Zou Yujia He Yunfan Xiong Yuxiang Luo Yuxiang You Yuxuan Liu Yuyang Zhou Y. X. Zhu Yanhong Xu Yanping Huang Yaohui Li Yi Zheng Yuchen Zhu Yunxian Ma Ying Tang Yukun Zha Yuting Yan Z. Z. Ren Zehui Ren Zhangli Sha Zhe Fu Zhean Xu Zhenda Xie Zhengyan Zhang Zhewen Hao Zhicheng Ma Zhigang Yan Zhiyu Wu Zihui Gu Zijia Zhu Zijun Liu Zilin Li Ziwei Xie Ziyang Song Zizheng Pan Zhen Huang Zhipeng Xu Zhongyu Zhang and Zhen Zhang. 2025. DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948 [cs.CL] https:\/\/arxiv.org\/abs\/2501.12948"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan Anirudh Goyal Anthony Hartshorn Aobo Yang Archi Mitra Archie Sravankumar Artem Korenev Arthur Hinsvark Arun Rao Aston Zhang Aur\u00e9lien Rodriguez Austen Gregerson Ava Spataru Baptiste Rozi\u00e8re Bethany Biron Binh Tang Bobbie Chern Charlotte Caucheteux Chaya Nayak Chloe Bi Chris Marra Chris McConnell Christian Keller Christophe Touret Chunyang Wu Corinne Wong Cristian Canton Ferrer Cyrus Nikolaidis Damien Allonsius Daniel Song Danielle Pintz Danny Livshits David Esiobu Dhruv Choudhary Dhruv Mahajan Diego Garcia-Olano Diego Perino Dieuwke Hupkes Egor Lakomkin Ehab AlBadawy Elina Lobanova Emily Dinan Eric Michael Smith Filip Radenovic Frank Zhang Gabriel Synnaeve Gabrielle Lee Georgia Lewis Anderson Graeme Nail Gr\u00e9goire Mialon Guan Pang Guillem Cucurell Hailey Nguyen Hannah Korevaar Hu Xu Hugo Touvron Iliyan Zarov Imanol Arrieta Ibarra Isabel M. Kloumann Ishan Misra Ivan Evtimov Jade Copet Jaewon Lee Jan Geffert Jana Vranes Jason Park Jay Mahadeokar Jeet Shah Jelmer van der Linde Jennifer Billock Jenny Hong Jenya Lee Jeremy Fu Jianfeng Chi Jianyu Huang Jiawen Liu Jie Wang Jiecao Yu Joanna Bitton Joe Spisak Jongsoo Park Joseph Rocca Joshua Johnstun Joshua Saxe Junteng Jia Kalyan Vasuden Alwala Kartikeya Upasani Kate Plawiak Ke Li Kenneth Heafield Kevin Stone and et al. 2024. The Llama 3 Herd of Models. CoRR abs\/2407.21783 (2024). arXiv:2407.21783 10.48550\/ARXIV.2407.21783","DOI":"10.48550\/ARXIV.2407.21783"},{"key":"e_1_3_2_1_8_1","unstructured":"Georgi Gerganov. 2024. ggerganov\/llama.cpp: Port of Facebook's LLaMA model in C\/C++. https:\/\/github.com\/ggerganov\/llama.cpp."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1038"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Cheng-Yu Hsieh Chun-Liang Li Chih-Kuan Yeh Hootan Nakhost Yasuhisa Fujii Alexander Ratner Ranjay Krishna Chen-Yu Lee and Tomas Pfister. 2023. Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes. arXiv:2305.02301 [cs.CL] https:\/\/arxiv.org\/abs\/2305.02301","DOI":"10.18653\/v1\/2023.findings-acl.507"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2405.15198"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-024-00070-x"},{"key":"e_1_3_2_1_13_1","unstructured":"HuggingFace. 2022. Hugging face accelerate. https:\/\/huggingface.co\/docs\/accelerate\/index."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of Machine Learning and Systems, I. Dhillon, D. Papailiopoulos, and V. Sze (Eds.)","volume":"2","author":"Jain Paras","year":"2020","unstructured":"Paras Jain, Ajay Jain, Aniruddha Nrusimha, Amir Gholami, Pieter Abbeel, Joseph Gonzalez, Kurt Keutzer, and Ion Stoica. 2020. Checkmate: Breaking the Memory Wall with Optimal Tensor Rematerialization. In Proceedings of Machine Learning and Systems, I. Dhillon, D. Papailiopoulos, and V. Sze (Eds.), Vol. 2. 497--511. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2020\/file\/0b816ae8f06f8dd3543dc3d9ef196cab-Paper.pdf"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126511"},{"key":"e_1_3_2_1_16_1","unstructured":"Jared Kaplan Sam McCandlish Tom Henighan Tom B. Brown Benjamin Chess Rewon Child Scott Gray Alec Radford Jeffrey Wu and Dario Amodei. 2020. Scaling Laws for Neural Language Models. arXiv:2001.08361 [cs.LG] https:\/\/arxiv.org\/abs\/2001.08361"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.5555\/3618408.3619203"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa (Eds.)","volume":"6","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. 2024. AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration. In Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa (Eds.), Vol. 6. 87--100. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2024\/file\/42a452cbafa9dd64e9ba4aa95cc1ef21-Paper-Conference.pdf"},{"key":"e_1_3_2_1_20_1","unstructured":"Linux. 2024. cgroups - Linux control groups. https:\/\/man7.org\/linux\/man-pages\/man7\/cgroups.7.html."},{"key":"e_1_3_2_1_21_1","unstructured":"Linux. 2024. mmap munmap - map or unmap files or devices into memory. https:\/\/man7.org\/linux\/man-pages\/man2\/mmap.2.html."},{"key":"e_1_3_2_1_22_1","unstructured":"Linux. 2024. taskset - set or retrieve a process's CPU affinity. https:\/\/man7.org\/linux\/man-pages\/man1\/taskset.1.html."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"22176","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Jue Wang, Tri Dao, Tianyi Zhou, Binhang Yuan, Zhao Song, Anshumali Shrivastava, Ce Zhang, Yuandong Tian, Christopher Re, and Beidi Chen. 2023. Deja Vu: Contextual Sparsity for Efficient LLMs at Inference Time. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 22137--22176. https:\/\/proceedings.mlr.press\/v202\/liu23am.html"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"22176","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Jue Wang, Tri Dao, Tianyi Zhou, Binhang Yuan, Zhao Song, Anshumali Shrivastava, Ce Zhang, Yuandong Tian, Christopher Re, and Beidi Chen. 2023. Deja Vu: Contextual Sparsity for Efficient LLMs at Inference Time. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 22137--22176. https:\/\/proceedings.mlr.press\/v202\/liu23am.html"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Hanjia Lyu Song Jiang Hanqing Zeng Yinglong Xia Qifan Wang Si Zhang Ren Chen Christopher Leung Jiajie Tang and Jiebo Luo. 2024. LLM-Rec: Personalized Recommendation via Prompting Large Language Models. arXiv:2307.15780 [cs.CL] https:\/\/arxiv.org\/abs\/2307.15780","DOI":"10.18653\/v1\/2024.findings-naacl.39"},{"key":"e_1_3_2_1_26_1","unstructured":"Yu Mao Weilan Wang Hongchao Du Nan Guan and Chun Jason Xue. 2024. On the Compressibility of Quantized Large Language Models. arXiv:2403.01384 [cs.LG] https:\/\/arxiv.org\/abs\/2403.01384"},{"key":"e_1_3_2_1_27_1","unstructured":"NVIDIA. 2023. FasterTransformer. https:\/\/developer.nvidia.com\/nvidia-triton-inference-server."},{"key":"e_1_3_2_1_28_1","unstructured":"OpenAI : Aaron Jaech Adam Kalai Adam Lerer Adam Richardson Ahmed El-Kishky Aiden Low Alec Helyar Aleksander Madry Alex Beutel Alex Carney Alex Iftimie Alex Karpenko Alex Tachard Passos Alexander Neitz Alexander Prokofiev Alexander Wei Allison Tam Ally Bennett Ananya Kumar Andre Saraiva Andrea Vallone Andrew Duberstein Andrew Kondrich Andrey Mishchenko Andy Applebaum Angela Jiang Ashvin Nair Barret Zoph Behrooz Ghorbani Ben Rossen Benjamin Sokolowsky Boaz Barak Bob McGrew Borys Minaiev Botao Hao Bowen Baker Brandon Houghton Brandon McKinzie Brydon Eastman Camillo Lugaresi Cary Bassin Cary Hudson Chak Ming Li Charles de Bourcy Chelsea Voss Chen Shen Chong Zhang Chris Koch Chris Orsinger Christopher Hesse Claudia Fischer Clive Chan Dan Roberts Daniel Kappler Daniel Levy Daniel Selsam David Dohan David Farhi David Mely David Robinson Dimitris Tsipras Doug Li Dragos Oprica Eben Freeman Eddie Zhang Edmund Wong Elizabeth Proehl Enoch Cheung Eric Mitchell Eric Wallace Erik Ritter Evan Mays Fan Wang Felipe Petroski Such Filippo Raso Florencia Leoni Foivos Tsimpourlas Francis Song Fred von Lohmann Freddie Sulit Geoff Salmon Giambattista Parascandolo Gildas Chabot Grace Zhao Greg Brockman Guillaume Leclerc Hadi Salman Haiming Bao Hao Sheng Hart Andrin Hessam Bagherinezhad Hongyu Ren Hunter Lightman Hyung Won Chung Ian Kivlichan Ian O'Connell Ian Osband Ignasi Clavera Gilaberte Ilge Akkaya Ilya Kostrikov Ilya Sutskever Irina Kofman Jakub Pachocki James Lennon Jason Wei Jean Harb Jerry Twore Jiacheng Feng Jiahui Yu Jiayi Weng Jie Tang Jieqi Yu Joaquin Qui\u00f1onero Candela Joe Palermo Joel Parish Johannes Heidecke John Hallman John Rizzo Jonathan Gordon Jonathan Uesato Jonathan Ward Joost Huizinga Julie Wang Kai Chen Kai Xiao Karan Singhal Karina Nguyen Karl Cobbe Katy Shi Kayla Wood Kendra Rimbach Keren Gu-Lemberg Kevin Liu Kevin Lu Kevin Stone Kevin Yu Lama Ahmad Lauren Yang Leo Liu Leon Maksin Leyton Ho Liam Fedus Lilian Weng Linden Li Lindsay McCallum Lindsey Held Lorenz Kuhn Lukas Kondraciuk Lukasz Kaiser Luke Metz Madelaine Boyd Maja Trebacz Manas Joglekar Mark Chen Marko Tintor Mason Meyer Matt Jones Matt Kaufer Max Schwarzer Meghan Shah Mehmet Yatbaz Melody Y. Guan Mengyuan Xu Mengyuan Yan Mia Glaese Mianna Chen Michael Lampe Michael Malek Michele Wang Michelle Fradin Mike McClay Mikhail Pavlov Miles Wang Mingxuan Wang Mira Murati Mo Bavarian Mostafa Rohaninejad Nat McAleese Neil Chowdhury Neil Chowdhury Nick Ryder Nikolas Tezak Noam Brown Ofir Nachum Oleg Boiko Oleg Murk Olivia Watkins Patrick Chao Paul Ashbourne Pavel Izmailov Peter Zhokhov Rachel Dias Rahul Arora Randall Lin Rapha Gontijo Lopes Raz Gaon Reah Miyara Reimar Leike Renny Hwang Rhythm Garg Robin Brown Roshan James Rui Shu Ryan Cheu Ryan Greene Saachi Jain Sam Altman Sam Toizer Sam Toyer Samuel Miserendino Sandhini Agarwal Santiago Hernandez Sasha Baker Scott McKinney Scottie Yan Shengjia Zhao Shengli Hu Shibani Santurkar Shraman Ray Chaudhuri Shuyuan Zhang Siyuan Fu Spencer Papay Steph Lin Suchir Balaji Suvansh Sanjeev Szymon Sidor Tal Broda Aidan Clark Tao Wang Taylor Gordon Ted Sanders Tejal Patwardhan Thibault Sottiaux Thomas Degry Thomas Dimson Tianhao Zheng Timur Garipov Tom Stasi Trapit Bansal Trevor Creech Troy Peterson Tyna Eloundou Valerie Qi Vineet Kosaraju Vinnie Monaco Vitchyr Pong Vlad Fomenko Weiyi Zheng Wenda Zhou Wes McCabe Wojciech Zaremba Yann Dubois Yinghai Lu Yining Chen Young Cha Yu Bai Yuchen He Yuchen Zhang Yunyun Wang Zheng Shao and Zhuohan Li. 2024. OpenAI o1 System Card. arXiv:2412.16720 [cs.AI] https:\/\/arxiv.org\/abs\/2412.16720"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the 39th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"17583","author":"Patil Shishir G.","year":"2022","unstructured":"Shishir G. Patil, Paras Jain, Prabal Dutta, Ion Stoica, and Joseph Gonzalez. 2022. POET: Training Neural Networks on Tiny Devices with Integrated Rematerialization and Paging. In Proceedings of the 39th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 162), Kamalika Chaudhuri, Stefanie Jegelka, Le Song, Csaba Szepesvari, Gang Niu, and Sivan Sabato (Eds.). PMLR, 17573--17583. https:\/\/proceedings.mlr.press\/v162\/patil22b.html"},{"key":"e_1_3_2_1_30_1","unstructured":"Reiner Pope Sholto Douglas Aakanksha Chowdhery Jacob Devlin James Bradbury Anselm Levskaya Jonathan Heek Kefan Xiao Shivani Agrawal and Jeff Dean. 2022. Efficiently Scaling Transformer Inference. arXiv:2211.05102 [cs.LG] https:\/\/arxiv.org\/abs\/2211.05102"},{"key":"e_1_3_2_1_31_1","unstructured":"PrivateGPT 2023. PrivateGPT. https:\/\/github.com\/zylon-ai\/private-gpt."},{"key":"e_1_3_2_1_32_1","volume-title":"ZeRO-Offload: Democratizing Billion-Scale Model Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. ZeRO-Offload: Democratizing Billion-Scale Model Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 551--564. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/ren-jie"},{"key":"e_1_3_2_1_33_1","volume-title":"International Conference on Machine Learning, ICML 2023","volume":"31116","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. In International Conference on Machine Learning, ICML 2023, 23--29 July 2023, Honolulu, Hawaii, USA (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 31094--31116. https:\/\/proceedings.mlr.press\/v202\/sheng23a.html"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"31116","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher Re, Ion Stoica, and Ce Zhang. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 31094--31116. https:\/\/proceedings.mlr.press\/v202\/sheng23a.html"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695964"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2302.13971"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton-Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aur\u00e9lien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. CoRR abs\/2307.09288 (2023). arXiv:2307.09288 10.48550\/ARXIV.2307.09288","DOI":"10.48550\/ARXIV.2307.09288"},{"key":"e_1_3_2_1_38_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4--9, 2017, Long Beach, CA, USA, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna M. Wallach, Rob Fergus, S. V. N. Vishwanathan, and Roman Garnett (Eds.). 5998--6008. https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3200691.3178491"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.988"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-industry.15"},{"key":"e_1_3_2_1_42_1","volume-title":"Nan Guan, and Chun Jason Xue.","author":"Wu Shangyu","year":"2025","unstructured":"Shangyu Wu, Hongchao Du, Ying Xiong, Shuai Chen, Tei wei Kuo, Nan Guan, and Chun Jason Xue. 2025. EvoP: Robust LLM Inference via Evolutionary Pruning. arXiv:2502.14910 [cs.CL] https:\/\/arxiv.org\/abs\/2502.14910"},{"key":"e_1_3_2_1_43_1","unstructured":"Haojun Xia Zhen Zheng Yuchao Li Donglin Zhuang Zhongzhu Zhou Xiafei Qiu Yong Li Wei Lin and Shuaiwen Leon Song. 2023. Flash-LLM: Enabling Cost-Effective and Highly-Efficient Large Generative Model Inference with Unstructured Sparsity. arXiv:2309.10285 [cs.DC] https:\/\/arxiv.org\/abs\/2309.10285"},{"key":"e_1_3_2_1_44_1","unstructured":"Daliang Xu Wangsong Yin Xin Jin Ying Zhang Shiyun Wei Mengwei Xu and Xuanzhe Liu. 2023. LLMCad: Fast and Scalable On-device Large Language Model Inference. arXiv:2309.04255 [cs.NI] https:\/\/arxiv.org\/abs\/2309.04255"},{"key":"e_1_3_2_1_45_1","unstructured":"Daliang Xu Wangsong Yin Xin Jin Ying Zhang Shiyun Wei Mengwei Xu and Xuanzhe Liu. 2023. LLMCad: Fast and Scalable On-device Large Language Model Inference. arXiv:2309.04255 [cs.NI] https:\/\/arxiv.org\/abs\/2309.04255"},{"key":"e_1_3_2_1_46_1","unstructured":"Yuzhuang Xu Xu Han Zonghan Yang Shuo Wang Qingfu Zhu Zhiyuan Liu Weidong Liu and Wanxiang Che. 2024. OneBit: Towards Extremely Low-bit Large Language Models. arXiv:2402.11295 [cs.CL] https:\/\/arxiv.org\/abs\/2402.11295"},{"key":"e_1_3_2_1_47_1","unstructured":"Zhenliang Xue Yixin Song Zeyu Mi Xinrui Zheng Yubin Xia and Haibo Chen. 2024. PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv:2406.06282 [cs.LG] https:\/\/arxiv.org\/abs\/2406.06282"},{"key":"e_1_3_2_1_48_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 521--538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_2_1_49_1","unstructured":"Peiyuan Zhang Guangtao Zeng Tianduo Wang and Wei Lu. 2024. TinyLlama: An Open-Source Small Language Model. arXiv:2401.02385 [cs.CL] https:\/\/arxiv.org\/abs\/2401.02385"},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa (Eds.)","volume":"6","author":"Zhao Yilong","year":"2024","unstructured":"Yilong Zhao, Chien-Yu Lin, Kan Zhu, Zihao Ye, Lequn Chen, Size Zheng, Luis Ceze, Arvind Krishnamurthy, Tianqi Chen, and Baris Kasikci. 2024. Atom: Low-Bit Quantization for Efficient and Accurate LLM Serving. In Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa (Eds.), Vol. 6. 196--209. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2024\/file\/5edb57c05c81d04beb716ef1d542fe9e-Paper-Conference.pdf"},{"key":"e_1_3_2_1_51_1","volume-title":"PetS: A Unified Framework for Parameter-Efficient Transformers Serving. In 2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Zhou Zhe","year":"2022","unstructured":"Zhe Zhou, Xuechao Wei, Jiejing Zhang, and Guangyu Sun. 2022. PetS: A Unified Framework for Parameter-Efficient Transformers Serving. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). USENIX Association, Carlsbad, CA, 489--504. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/zhou-zhe"}],"event":{"name":"EuroMLSys '25: 5th Workshop on Machine Learning and Systems","location":"World Trade Center Rotterdam Netherlands","acronym":"EuroMLSys '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 5th Workshop on Machine Learning and Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721146.3721961","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721146.3721961","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:57:39Z","timestamp":1750298259000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721146.3721961"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":51,"alternative-id":["10.1145\/3721146.3721961","10.1145\/3721146"],"URL":"https:\/\/doi.org\/10.1145\/3721146.3721961","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-04-01","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}