{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T13:14:03Z","timestamp":1780060443423,"version":"3.54.0"},"publisher-location":"New York, NY, USA","reference-count":86,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T00:00:00Z","timestamp":1781913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,21]]},"DOI":"10.1145\/3745756.3809234","type":"proceedings-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:52:21Z","timestamp":1780059141000},"page":"723-736","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["KVSwap: Disk-aware KV Cache Offloading for Long-Context On-device Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1021-3184","authenticated-orcid":false,"given":"Huawei","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Leeds, Leeds, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2014-5453","authenticated-orcid":false,"given":"Chunwei","family":"Xia","sequence":"additional","affiliation":[{"name":"University of Leeds, Leeds, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6157-0662","authenticated-orcid":false,"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Leeds, Leeds, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,20]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"NVIDIA 2024. Jetson-Orin. NVIDIA. https:\/\/www.nvidia.com\/en-us\/autonomous-machines\/embedded-systems\/jetson-orin\/"},{"key":"e_1_3_2_2_2_1","unstructured":"Georgi Gerganov 2024. llama.cpp. Georgi Gerganov. https:\/\/github.com\/ggerganov\/llama.cpp"},{"key":"e_1_3_2_2_3_1","unstructured":"MediaTek 2025. MediaTek Dimensity 9400 Plus. MediaTek. https:\/\/www.mediatek.com\/products\/smartphones\/mediatek-dimensity-9400-plus"},{"key":"e_1_3_2_2_4_1","unstructured":"MLC Team 2024. MLC-LLM. MLC Team. https:\/\/github.com\/mlc-ai\/mlc-llm"},{"key":"e_1_3_2_2_5_1","unstructured":"NVIDIA 2025. NVIDIA Nsight Systems. NVIDIA. https:\/\/developer.nvidia.com\/nsight-systems"},{"key":"e_1_3_2_2_6_1","unstructured":"NVIDIA 2025. NVIDIA Tools Extension Library. NVIDIA. https:\/\/github.com\/NVIDIA\/NVTX"},{"key":"e_1_3_2_2_7_1","unstructured":"Orange Pi 2025. Orange Pi 5 Pro. Orange Pi. http:\/\/www.orangepi.org\/html\/hardWare\/computerAndMicrocontrollers\/details\/Orange-Pi-5-Pro.html"},{"key":"e_1_3_2_2_8_1","unstructured":"Ultralytics 2025. Ultralytics YOLO. Ultralytics. https:\/\/github.com\/ultralytics\/ultralytics"},{"key":"e_1_3_2_2_9_1","unstructured":"Adobe Inc. 2025. Adobe Acrobat AI Assistant: Generative AI Tool for PDF Summarization and Smart Q&A. https:\/\/www.adobe.com\/acrobat\/generative-ai-pdf.html."},{"key":"e_1_3_2_2_10_1","volume-title":"Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 2023. Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245 (2023)."},{"key":"e_1_3_2_2_11_1","volume-title":"Mohammad Rastegari, and Mehrdad Farajtabar.","author":"Alizadeh Keivan","year":"2023","unstructured":"Keivan Alizadeh, Iman Mirzadeh, Dmitry Belenko, Karen Khatamifard, Minsik Cho, Carlo C Del Mundo, Mohammad Rastegari, and Mehrdad Farajtabar. 2023. Llm in a flash: Efficient large language model inference with limited memory. arXiv preprint arXiv:2312.11514 (2023)."},{"key":"e_1_3_2_2_12_1","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"Anagnostidis Sotiris","year":"2023","unstructured":"Sotiris Anagnostidis, Dario Pavllo, Luca Biggio, Lorenzo Noci, Aurelien Lucchi, and Thomas Hofmann. 2023. Dynamic context pruning for efficient and interpretable autoregressive transformers. In Proceedings of the 37th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS '23). Curran Associates Inc., Red Hook, NY, USA, Article 2845, 22 pages."},{"key":"e_1_3_2_2_13_1","unstructured":"Apple Inc. 2025. Apple Intelligence. https:\/\/apple.com\/apple-intelligence."},{"key":"e_1_3_2_2_14_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_2_15_1","volume-title":"Longbench: A bilingual, multitask benchmark for long context understanding. arXiv preprint arXiv:2308.14508","author":"Bai Yushi","year":"2023","unstructured":"Yushi Bai, Xin Lv, Jiajie Zhang, Hongchang Lyu, Jiankai Tang, Zhidian Huang, Zhengxiao Du, Xiao Liu, Aohan Zeng, Lei Hou, et al. 2023. Longbench: A bilingual, multitask benchmark for long context understanding. arXiv preprint arXiv:2308.14508 (2023)."},{"key":"e_1_3_2_2_16_1","volume-title":"Longwriter: Unleashing 10,000+ word generation from long context llms. arXiv preprint arXiv:2408.07055","author":"Bai Yushi","year":"2024","unstructured":"Yushi Bai, Jiajie Zhang, Xin Lv, Linzhi Zheng, Siqi Zhu, Lei Hou, Yuxiao Dong, Jie Tang, and Juanzi Li. 2024. Longwriter: Unleashing 10,000+ word generation from long context llms. arXiv preprint arXiv:2408.07055 (2024)."},{"key":"e_1_3_2_2_17_1","volume-title":"Longformer: The Long-Document Transformer. CoRR abs\/2004.05150","author":"Beltagy Iz","year":"2020","unstructured":"Iz Beltagy, Matthew E. Peters, and Arman Cohan. 2020. Longformer: The Long-Document Transformer. CoRR abs\/2004.05150 (2020). arXiv:2004.05150 https:\/\/arxiv.org\/abs\/2004.05150"},{"key":"e_1_3_2_2_18_1","volume-title":"Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems","author":"Chen Renze","year":"2024","unstructured":"Renze Chen, Zhuofeng Wang, Beiquan Cao, Tong Wu, Size Zheng, Xiuhong Li, Xuechao Wei, Shengen Yan, Meng Li, and Yun Liang. 2024. Ark-Vale: Efficient Generative LLM Inference with Recallable Key-Value Eviction. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024, Amir Globersons, Lester Mackey, Danielle Belgrave, Angela Fan, Ulrich Paquet, Jakub M. Tomczak, and Cheng Zhang (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2024\/hash\/cd4b49379efac6e84186a3ffce108c37-Abstract-Conference.html"},{"key":"e_1_3_2_2_19_1","volume-title":"23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Chen Weijian","year":"2025","unstructured":"Weijian Chen, Shuibing He, Haoyang Qu, Ruidong Zhang, Siling Yang, Ping Chen, Yi Zheng, Baoxing Huai, and Gang Chen. 2025. {IMPRESS}: An {Importance-Informed} {Multi-Tier} Prefix {KV} Storage System for Large Language Model Inference. In 23rd USENIX Conference on File and Storage Technologies (FAST 25). 187\u2013201."},{"key":"e_1_3_2_2_20_1","volume-title":"Generating Long Sequences with Sparse Transformers. CoRR abs\/1904.10509","author":"Child Rewon","year":"2019","unstructured":"Rewon Child, Scott Gray, Alec Radford, and Ilya Sutskever. 2019. Generating Long Sequences with Sparse Transformers. CoRR abs\/1904.10509 (2019). arXiv:1904.10509 http:\/\/arxiv.org\/abs\/1904.10509"},{"key":"e_1_3_2_2_21_1","unstructured":"Jiayu Ding Shuming Ma Li Dong Xingxing Zhang Shaohan Huang Wenhui Wang Nanning Zheng and Furu Wei. 2023. LongNet: Scaling Transformers to 1 000 000 000 Tokens. arXiv:2307.02486 [cs.CL] https:\/\/arxiv.org\/abs\/2307.02486"},{"key":"e_1_3_2_2_22_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF02288367"},{"key":"e_1_3_2_2_24_1","unstructured":"Fireflies.ai Corp. 2025. Fireflies.ai: AI Meeting Assistant for transcription summaries highlights and search. https:\/\/fireflies.ai\/."},{"key":"e_1_3_2_2_25_1","volume-title":"International conference on machine learning. PMLR, 10323\u201310337","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar and Dan Alistarh. 2023. Sparsegpt: Massive language models can be accurately pruned in one-shot. In International conference on machine learning. PMLR, 10323\u201310337."},{"key":"e_1_3_2_2_26_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_2_27_1","unstructured":"Google. 2025. Google Gemini in Gmail: AI-powered Email Summaries and Natural-Language Q&A. https:\/\/blog.google\/products\/gmail\/."},{"key":"e_1_3_2_2_28_1","volume-title":"Google Lens: Natural-language video and voice search. https:\/\/lens.google\/.","year":"2025","unstructured":"Google. 2025. Google Lens: Natural-language video and voice search. https:\/\/lens.google\/."},{"key":"e_1_3_2_2_29_1","volume-title":"Google Recorder: On-device audio transcription and summarization. https:\/\/recorder.google.com\/.","year":"2025","unstructured":"Google. 2025. Google Recorder: On-device audio transcription and summarization. https:\/\/recorder.google.com\/."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.14778\/3598581.3598584"},{"key":"e_1_3_2_2_31_1","volume-title":"RULER: What's the Real Context Size of Your Long-Context Language Models? arXiv preprint arXiv:2404.06654","author":"Hsieh Cheng-Ping","year":"2024","unstructured":"Cheng-Ping Hsieh, Simeng Sun, Samuel Kriman, Shantanu Acharya, Dima Rekesh, Fei Jia, Yang Zhang, and Boris Ginsburg. 2024. RULER: What's the Real Context Size of Your Long-Context Language Models? arXiv preprint arXiv:2404.06654 (2024)."},{"key":"e_1_3_2_2_32_1","volume-title":"Language model compression with weighted low-rank factorization. arXiv preprint arXiv:2207.00112","author":"Hsu Yen-Chang","year":"2022","unstructured":"Yen-Chang Hsu, Ting Hua, Sungen Chang, Qian Lou, Yilin Shen, and Hongxia Jin. 2022. Language model compression with weighted low-rank factorization. arXiv preprint arXiv:2207.00112 (2022)."},{"key":"e_1_3_2_2_33_1","volume-title":"Livecodebench: Holistic and contamination free evaluation of large language models for code. arXiv preprint arXiv:2403.07974","author":"Jain Naman","year":"2024","unstructured":"Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida Wang, Armando Solar-Lezama, Koushik Sen, and Ion Stoica. 2024. Livecodebench: Holistic and contamination free evaluation of large language models for code. arXiv preprint arXiv:2403.07974 (2024)."},{"key":"e_1_3_2_2_34_1","first-page":"52481","article-title":"Minference 1.0: Accelerating pre-filling for long-context llms via dynamic sparse attention","volume":"37","author":"Jiang Huiqiang","year":"2024","unstructured":"Huiqiang Jiang, Yucheng Li, Chengruidong Zhang, Qianhui Wu, Xufang Luo, Surin Ahn, Zhenhua Han, Amir H Abdi, Dongsheng Li, Chin-Yew Lin, et al. 2024. Minference 1.0: Accelerating pre-filling for long-context llms via dynamic sparse attention. Advances in Neural Information Processing Systems 37 (2024), 52481\u201352515.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_35_1","unstructured":"Shibo Jie Yehui Tang Kai Han Zhi-Hong Deng and Jing Han. 2025. SpeCache: Speculative Key-Value Caching for Efficient Generation of LLMs. arXiv:2503.16163 [cs.CL] https:\/\/arxiv.org\/abs\/2503.16163"},{"key":"e_1_3_2_2_36_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Jin Hongye","year":"2024","unstructured":"Hongye Jin, Xiaotian Han, Jingfeng Yang, Zhimeng Jiang, Zirui Liu, Chia-Yuan Chang, Huiyuan Chen, and Xia Hu. 2024. LLM Maybe LongLM: SelfExtend LLM context window without tuning. In Proceedings of the 41st International Conference on Machine Learning (Vienna, Austria) (ICML'24). JMLR.org, Article 888, 16 pages."},{"key":"e_1_3_2_2_37_1","unstructured":"Greg Kamradt. 2023. Needle in a Haystack - Pressure Testing LLMs. https:\/\/github.com\/gkamradt\/LLMTest_NeedleInAHaystack"},{"key":"e_1_3_2_2_38_1","volume-title":"Reformer: The Efficient Transformer. In 8th International Conference on Learning Representations, ICLR 2020","author":"Kitaev Nikita","year":"2020","unstructured":"Nikita Kitaev, Lukasz Kaiser, and Anselm Levskaya. 2020. Reformer: The Efficient Transformer. In 8th International Conference on Learning Representations, ICLR 2020, Addis Ababa, Ethiopia, April 26\u201330, 2020. OpenReview.net. https:\/\/openreview.net\/forum?id=rkgNKkHtvB"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_2_40_1","volume-title":"Building real-world meeting summarization systems using large language models: A practical perspective. arXiv preprint arXiv:2310.19233","author":"Rahman Laskar Md Tahmid","year":"2023","unstructured":"Md Tahmid Rahman Laskar, Xue-Yong Fu, Cheng Chen, and Shashi Bhushan Tn. 2023. Building real-world meeting summarization systems using large language models: A practical perspective. arXiv preprint arXiv:2310.19233 (2023)."},{"key":"e_1_3_2_2_41_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Lee Wonbeom","year":"2024","unstructured":"Wonbeom Lee, Jungi Lee, Junghwan Seo, and Jaewoong Sim. 2024. {InfiniGen}: Efficient generative inference of large language models with dynamic {KV} cache management. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 155\u2013172."},{"key":"e_1_3_2_2_42_1","first-page":"22947","article-title":"Snapkv: Llm knows what you are looking for before generation","volume":"37","author":"Li Yuhong","year":"2024","unstructured":"Yuhong Li, Yingbing Huang, Bowen Yang, Bharat Venkitesh, Acyr Locatelli, Hanchen Ye, Tianle Cai, Patrick Lewis, and Deming Chen. 2024. Snapkv: Llm knows what you are looking for before generation. Advances in Neural Information Processing Systems 37 (2024), 22947\u201322970.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_43_1","unstructured":"Hunter Lightman Vineet Kosaraju Yuri Burda Harrison Edwards Bowen Baker Teddy Lee Jan Leike John Schulman Ilya Sutskever and Karl Cobbe. 2023. Let's verify step by step. In The twelfth international conference on learning representations."},{"key":"e_1_3_2_2_44_1","first-page":"87","article-title":"AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration","volume":"6","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. 2024. AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration. Proceedings of Machine Learning and Systems 6 (2024), 87\u2013100.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_2_45_1","volume-title":"Qserve: W4a8kv4 quantization and system co-design for efficient llm serving. arXiv preprint arXiv:2405.04532","author":"Lin Yujun","year":"2024","unstructured":"Yujun Lin, Haotian Tang, Shang Yang, Zhekai Zhang, Guangxuan Xiao, Chuang Gan, and Song Han. 2024. Qserve: W4a8kv4 quantization and system co-design for efficient llm serving. arXiv preprint arXiv:2405.04532 (2024)."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672274"},{"key":"e_1_3_2_2_47_1","volume-title":"International Conference on Machine Learning. PMLR, 22137\u201322176","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Jue Wang, Tri Dao, Tianyi Zhou, Binhang Yuan, Zhao Song, Anshumali Shrivastava, Ce Zhang, Yuandong Tian, Christopher Re, et al. 2023. Deja vu: Contextual sparsity for efficient llms at inference time. In International Conference on Machine Learning. PMLR, 22137\u201322176."},{"key":"e_1_3_2_2_48_1","volume-title":"Kivi: A tuning-free asymmetric 2bit quantization for kv cache. arXiv preprint arXiv:2402.02750","author":"Liu Zirui","year":"2024","unstructured":"Zirui Liu, Jiayi Yuan, Hongye Jin, Shaochen Zhong, Zhaozhuo Xu, Vladimir Braverman, Beidi Chen, and Xia Hu. 2024. Kivi: A tuning-free asymmetric 2bit quantization for kv cache. arXiv preprint arXiv:2402.02750 (2024)."},{"key":"e_1_3_2_2_49_1","volume-title":"Mobilellm: Optimizing sub-billion parameter language models for on-device use cases. arXiv preprint arXiv:2402.14905","author":"Liu Zechun","year":"2024","unstructured":"Zechun Liu, Changsheng Zhao, Forrest Iandola, Chen Lai, Yuandong Tian, Igor Fedorov, Yunyang Xiong, Ernie Chang, Yangyang Shi, Raghuraman Krishnamoorthi, et al. 2024. Mobilellm: Optimizing sub-billion parameter language models for on-device use cases. arXiv preprint arXiv:2402.14905 (2024)."},{"key":"e_1_3_2_2_50_1","volume-title":"Workshop on Efficient Systems for Foundation Models II@ ICML2024","author":"Mehta Sachin","year":"2024","unstructured":"Sachin Mehta, Mohammad Hossein Sekhavat, Qingqing Cao, Maxwell Horton, Yanzi Jin, Chenfan Sun, Seyed Iman Mirzadeh, Mahyar Najibi, Dmitry Belenko, Peter Zatloukal, et al. 2024. Openelm: An efficient language model family with open training and inference framework. In Workshop on Efficient Systems for Foundation Models II@ ICML2024."},{"key":"e_1_3_2_2_51_1","volume-title":"Pointer sentinel mixture models. arXiv preprint arXiv:1609.07843","author":"Merity Stephen","year":"2016","unstructured":"Stephen Merity, Caiming Xiong, James Bradbury, and Richard Socher. 2016. Pointer sentinel mixture models. arXiv preprint arXiv:1609.07843 (2016)."},{"key":"e_1_3_2_2_52_1","volume-title":"Analyzing IO amplification in Linux file systems. arXiv preprint arXiv:1707.08514","author":"Mohan Jayashree","year":"2017","unstructured":"Jayashree Mohan, Rohan Kadekodi, and Vijay Chidambaram. 2017. Analyzing IO amplification in Linux file systems. arXiv preprint arXiv:1707.08514 (2017)."},{"key":"e_1_3_2_2_53_1","unstructured":"Otter.ai Inc. 2025. Otter.ai: AI Meeting Agent for real-time transcription summarization and action-item extraction. https:\/\/otter.ai\/."},{"key":"e_1_3_2_2_54_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_2_55_1","volume-title":"Proceedings of machine learning and systems 5","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2023. Efficiently scaling transformer inference. Proceedings of machine learning and systems 5 (2023), 606\u2013624."},{"key":"e_1_3_2_2_56_1","unstructured":"Qualcomm Technologies Inc. 2024. Snapdragon 8 Elite Mobile Platform. https:\/\/www.qualcomm.com\/products\/mobile\/snapdragon\/smartphones\/snapdragon-8-series-mobile-platforms\/snapdragon-8-elite-mobile-platform."},{"key":"e_1_3_2_2_57_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research 21, 140 (2020), 1\u201367.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_2_58_1","volume-title":"International Conference on Machine Learning. PMLR, 31094\u201331116","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. 2023. Flexgen: High-throughput generative inference of large language models with a single gpu. In International Conference on Machine Learning. PMLR, 31094\u201331116."},{"key":"e_1_3_2_2_59_1","volume-title":"Loki: Low-rank keys for efficient sparse attention. arXiv preprint arXiv:2406.02542","author":"Singhania Prajwal","year":"2024","unstructured":"Prajwal Singhania, Siddharth Singh, Shwai He, Soheil Feizi, and Abhinav Bhatele. 2024. Loki: Low-rank keys for efficient sparse attention. arXiv preprint arXiv:2406.02542 (2024)."},{"key":"e_1_3_2_2_60_1","volume-title":"Shadowkv: Kv cache in shadows for high-throughput long-context llm inference. arXiv preprint arXiv:2410.21465","author":"Sun Hanshi","year":"2024","unstructured":"Hanshi Sun, Li-Wen Chang, Wenlei Bao, Size Zheng, Ningxin Zheng, Xin Liu, Harry Dong, Yuejie Chi, and Beidi Chen. 2024. Shadowkv: Kv cache in shadows for high-throughput long-context llm inference. arXiv preprint arXiv:2410.21465 (2024)."},{"key":"e_1_3_2_2_61_1","volume-title":"Quest: Query-aware sparsity for efficient long-context llm inference. arXiv preprint arXiv:2406.10774","author":"Tang Jiaming","year":"2024","unstructured":"Jiaming Tang, Yilong Zhao, Kan Zhu, Guangxuan Xiao, Baris Kasikci, and Song Han. 2024. Quest: Query-aware sparsity for efficient long-context llm inference. arXiv preprint arXiv:2406.10774 (2024)."},{"key":"e_1_3_2_2_62_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Tang Jiaming","year":"2024","unstructured":"Jiaming Tang, Yilong Zhao, Kan Zhu, Guangxuan Xiao, Baris Kasikci, and Song Han. 2024. QUEST: query-aware sparsity for efficient long-context LLM inference. In Proceedings of the 41st International Conference on Machine Learning (Vienna, Austria) (ICML'24). JMLR.org, Article 1955, 11 pages."},{"key":"e_1_3_2_2_63_1","volume-title":"Cassidy Hardin, Surya Bhupatiraju, L\u00e9onard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ram\u00e9, et al.","author":"Team Gemma","year":"2024","unstructured":"Gemma Team, Morgane Riviere, Shreya Pathak, Pier Giuseppe Sessa, Cassidy Hardin, Surya Bhupatiraju, L\u00e9onard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ram\u00e9, et al. 2024. Gemma 2: Improving open language models at a practical size. arXiv preprint arXiv:2408.00118 (2024)."},{"key":"e_1_3_2_2_64_1","volume-title":"Mobillama: Towards accurate and lightweight fully transparent gpt. arXiv preprint arXiv:2402.16840","author":"Thawakar Omkar","year":"2024","unstructured":"Omkar Thawakar, Ashmal Vayani, Salman Khan, Hisham Cholakal, Rao M Anwer, Michael Felsberg, Tim Baldwin, Eric P Xing, and Fahad Shahbaz Khan. 2024. Mobillama: Towards accurate and lightweight fully transparent gpt. arXiv preprint arXiv:2402.16840 (2024)."},{"key":"e_1_3_2_2_65_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"e_1_3_2_2_67_1","volume-title":"Svd-llm: Truncation-aware singular value decomposition for large language model compression. arXiv preprint arXiv:2403.07378","author":"Wang Xin","year":"2024","unstructured":"Xin Wang, Yu Zheng, Zhongwei Wan, and Mi Zhang. 2024. Svd-llm: Truncation-aware singular value decomposition for large language model compression. arXiv preprint arXiv:2403.07378 (2024)."},{"key":"e_1_3_2_2_68_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems 35 (2022), 24824\u201324837."},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","unstructured":"Jianbo Wu Jie Ren Shuangyan Yang Konstantinos Parasyris Giorgis Georgakoudis Ignacio Laguna and Dong Li. 2025. LM-Offload: Performance Model-Guided Generative Inference of Large Language Models with Parallelism Control. In 2025 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW). 840\u2013849. 10.1109\/IPDPSW66978.2025.00134","DOI":"10.1109\/IPDPSW66978.2025.00134"},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"crossref","unstructured":"Wei Wu Zhuoshi Pan Chao Wang Liyi Chen Yunchu Bai Tianfu Wang Kun Fu Zheng Wang and Hui Xiong. 2025. TokenSelect: Efficient Long-Context Inference and Length Extrapolation for LLMs via Dynamic Token-Level KV Cache Selection. arXiv:2411.02886 [cs.CL] https:\/\/arxiv.org\/abs\/2411.02886","DOI":"10.18653\/v1\/2025.emnlp-main.1079"},{"key":"e_1_3_2_2_71_1","volume-title":"International Conference on Machine Learning. PMLR, 38087\u201338099","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning. PMLR, 38087\u201338099."},{"key":"e_1_3_2_2_72_1","volume-title":"Llmcad: Fast and scalable on-device large language model inference. arXiv preprint arXiv:2309.04255","author":"Xu Daliang","year":"2023","unstructured":"Daliang Xu, Wangsong Yin, Xin Jin, Ying Zhang, Shiyun Wei, Mengwei Xu, and Xuanzhe Liu. 2023. Llmcad: Fast and scalable on-device large language model inference. arXiv preprint arXiv:2309.04255 (2023)."},{"key":"e_1_3_2_2_73_1","volume-title":"PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv preprint arXiv:2406.06282","author":"Xue Zhenliang","year":"2024","unstructured":"Zhenliang Xue, Yixin Song, Zeyu Mi, Le Chen, Yubin Xia, and Haibo Chen. 2024. PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv preprint arXiv:2406.06282 (2024)."},{"key":"e_1_3_2_2_74_1","unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv et al. 2025. Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025)."},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.5555\/3692070.3694403"},{"key":"e_1_3_2_2_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696098"},{"key":"e_1_3_2_2_77_1","volume-title":"Edgemoe: Fast on-device inference of moe-based large language models. arXiv preprint arXiv:2308.14352","author":"Yi Rongjie","year":"2023","unstructured":"Rongjie Yi, Liwei Guo, Shiyun Wei, Ao Zhou, Shangguang Wang, and Mengwei Xu. 2023. Edgemoe: Fast on-device inference of moe-based large language models. arXiv preprint arXiv:2308.14352 (2023)."},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwae403"},{"key":"e_1_3_2_2_79_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1126"},{"key":"e_1_3_2_2_80_1","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"Zaheer Manzil","year":"2020","unstructured":"Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, and Amr Ahmed. 2020. Big bird: transformers for longer sequences. In Proceedings of the 34th International Conference on Neural Information Processing Systems (Vancouver, BC, Canada) (NIPS '20). Curran Associates Inc., Red Hook, NY, USA, Article 1450, 15 pages."},{"key":"e_1_3_2_2_81_1","doi-asserted-by":"crossref","unstructured":"Biao Zhang Ivan Titov and Rico Sennrich. 2021. Sparse Attention with Linear Units. arXiv:2104.07012 [cs.CL]","DOI":"10.18653\/v1\/2021.emnlp-main.523"},{"key":"e_1_3_2_2_82_1","volume-title":"The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=eG9AkHtYYH","author":"Zhang Haojie","year":"2024","unstructured":"Haojie Zhang, Zhenyu Wu, Zhenyu Zhang, Shixing Liu, Zujie Ren, Jingji Chen, Mingjie Zhan, Zirui Liu, Chen-Yu Lee, Yuchen Zhang, Haichuan Yang, Yuxiang Liu, Yafan He, and Zhaofeng He. 2024. SemSA: Semantic Sparse Attention is hidden in Large Language Models.. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=eG9AkHtYYH"},{"key":"e_1_3_2_2_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/3731445"},{"key":"e_1_3_2_2_84_1","first-page":"34661","article-title":"H2o: Heavy-hitter oracle for efficient generative inference of large language models","volume":"36","author":"Zhang Zhenyu","year":"2023","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher R\u00e9, Clark Barrett, et al. 2023. H2o: Heavy-hitter oracle for efficient generative inference of large language models. Advances in Neural Information Processing Systems 36 (2023), 34661\u201334710.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_85_1","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference. 13691\u201313701","author":"Zhou Junjie","year":"2025","unstructured":"Junjie Zhou, Yan Shu, Bo Zhao, Boya Wu, Zhengyang Liang, Shitao Xiao, Minghao Qin, Xi Yang, Yongping Xiong, Bo Zhang, et al. 2025. Mlvu: Benchmarking multitask long video understanding. In Proceedings of the Computer Vision and Pattern Recognition Conference. 13691\u201313701."},{"key":"e_1_3_2_2_86_1","unstructured":"Jinguo Zhu Weiyun Wang Zhe Chen Zhaoyang Liu Shenglong Ye Lixin Gu Hao Tian Yuchen Duan Weijie Su Jie Shao et al. 2025. Internvl3: Exploring advanced training and test-time recipes for open-source multimodal models. arXiv preprint arXiv:2504.10479 (2025)."}],"event":{"name":"MobiSys '26: 24th Annual International Conference on Mobile Systems, Applications and Services","location":"University of Cambridge Cambridge United Kingdom","acronym":"MobiSys '26","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 24th Annual International Conference on Mobile Systems, Applications and Services"],"original-title":[],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:56:55Z","timestamp":1780059415000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3745756.3809234"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,20]]},"references-count":86,"alternative-id":["10.1145\/3745756.3809234","10.1145\/3745756"],"URL":"https:\/\/doi.org\/10.1145\/3745756.3809234","relation":{},"subject":[],"published":{"date-parts":[[2026,6,20]]},"assertion":[{"value":"2026-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}