{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,17]],"date-time":"2026-02-17T13:02:07Z","timestamp":1771333327625,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,24]],"date-time":"2024-08-24T00:00:00Z","timestamp":1724457600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"SJTU-Huawei Explore X Gift Fund"},{"name":"China NSF grant","award":["62025204,62202296,62272293"],"award-info":[{"award-number":["62025204,62202296,62272293"]}]},{"name":"Tencent WeChat Research Fund"},{"name":"National Key R&D Program of China","award":["2022ZD0119100"],"award-info":[{"award-number":["2022ZD0119100"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,25]]},"DOI":"10.1145\/3637528.3671679","type":"proceedings-article","created":{"date-parts":[[2024,8,25]],"date-time":"2024-08-25T04:55:12Z","timestamp":1724561712000},"page":"597-608","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["Enhancing On-Device LLM Inference with Historical Cloud-Based LLM Interactions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6095-4947","authenticated-orcid":false,"given":"Yucheng","family":"Ding","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1650-4233","authenticated-orcid":false,"given":"Chaoyue","family":"Niu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0965-9058","authenticated-orcid":false,"given":"Fan","family":"Wu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9261-5210","authenticated-orcid":false,"given":"Shaojie","family":"Tang","sequence":"additional","affiliation":[{"name":"University of Texas at Dallas, Richardson, Texas, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2901-9969","authenticated-orcid":false,"given":"Chengfei","family":"Lyu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6934-1685","authenticated-orcid":false,"given":"Guihai","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,8,24]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"2023. Alpaca Dataset. https:\/\/github.com\/tatsu-lab\/stanford_alpaca\/blob\/main\/alpaca_data.json."},{"key":"e_1_3_2_2_2_1","unstructured":"2023. Apple Language Model Research Report. https:\/\/www.theverge.com\/2023\/9\/6\/23861763\/apple-ai-language-models-ajax-gpt-training-spending."},{"key":"e_1_3_2_2_3_1","unstructured":"2023. Chinese Alpaca Dataset. https:\/\/github.com\/LC1332\/Chinese-alpaca-lora."},{"key":"e_1_3_2_2_4_1","unstructured":"2023. Google Gemini. https:\/\/cloud.google.com\/blog\/products\/ai-machinelearning\/gemini-support-on-vertex-ai."},{"key":"e_1_3_2_2_5_1","unstructured":"2023. Huawei Noah. https:\/\/github.com\/huawei-noah\/Pretrained-Language-Model."},{"key":"e_1_3_2_2_6_1","unstructured":"2023. MediaTech On-Device LLM Report. https:\/\/www.mediatek.com\/blog\/mediatek-research-launches-the-worlds-first-ai-llm-in-traditional-chinese."},{"key":"e_1_3_2_2_7_1","unstructured":"2023. MiLM. https:\/\/github.com\/XiaoMi\/MiLM-6B."},{"key":"e_1_3_2_2_8_1","unstructured":"2023. Personalized Interactive Conversations Dataset. https:\/\/huggingface.co\/datasets\/erbacher\/personalized-interactive-conversations."},{"key":"e_1_3_2_2_9_1","unstructured":"2023. Qualcomm On-Device LLM Report. https:\/\/www.qualcomm.com\/ news\/releases\/2023\/07\/qualcomm-works-with-meta-to-enable-on-device-aiapplications-usi."},{"key":"e_1_3_2_2_10_1","unstructured":"2023. Retrieval-based Language Models and Applications. https:\/\/acl2023-retrieval-lm.github.io\/."},{"key":"e_1_3_2_2_11_1","unstructured":"2023. ShareGPT Dataset. https:\/\/huggingface.co\/datasets\/shareAI\/ShareGPTChinese-English-90k."},{"key":"e_1_3_2_2_12_1","unstructured":"2023. TinyLlama. https:\/\/huggingface.co\/PY007\/TinyLlama-1.1B-Chat-v0.1."},{"key":"e_1_3_2_2_13_1","unstructured":"2023. UC Berkeley Vicuna. https:\/\/github.com\/eddieali\/Vicuna-AI-LLM."},{"key":"e_1_3_2_2_14_1","unstructured":"2023. vivo BlueLM. https:\/\/developers.vivo.com\/product\/ai\/bluelm."},{"key":"e_1_3_2_2_15_1","unstructured":"2024. Alibaba Qwen. https:\/\/github.com\/QwenLM\/Qwen."},{"key":"e_1_3_2_2_16_1","unstructured":"2024. Baichuan2. https:\/\/github.com\/baichuan-inc\/Baichuan2."},{"key":"e_1_3_2_2_17_1","unstructured":"2024. Meta Llama. https:\/\/github.com\/facebookresearch\/llama."},{"key":"e_1_3_2_2_18_1","unstructured":"2024. MLC-LLM. https:\/\/llm.mlc.ai\/."},{"key":"e_1_3_2_2_19_1","unstructured":"2024. OpenAI ChatGPT. https:\/\/chat.openai.com\/."},{"key":"e_1_3_2_2_20_1","unstructured":"2024. Openompass LLM Leaderboard. https:\/\/rank.opencompass.org.cn\/leaderboard-llm."},{"key":"e_1_3_2_2_21_1","volume-title":"Mohammad Rastegari, and Mehrdad Farajtabar.","author":"Alizadeh Keivan","year":"2023","unstructured":"Keivan Alizadeh, Iman Mirzadeh, Dmitry Belenko, Karen Khatamifard, Minsik Cho, Carlo C. Del Mundo, Mohammad Rastegari, and Mehrdad Farajtabar. 2023. LLM in a flash: Efficient Large Language Model Inference with Limited Memory. CoRR abs\/2312.11514 (2023)."},{"key":"e_1_3_2_2_22_1","volume-title":"Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, and Oskar van der Wal.","author":"Biderman Stella","year":"2023","unstructured":"Stella Biderman, Hailey Schoelkopf, Quentin Gregory Anthony, Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, and Oskar van der Wal. 2023. Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling. In ICML (Proceedings of Machine Learning Research, Vol. 202). PMLR, 2397--2430."},{"key":"e_1_3_2_2_23_1","volume-title":"zhu Lin, and Mengwei Xu","author":"Cai Dongqi","year":"2023","unstructured":"Dongqi Cai, Yaozong Wu, Shangguang Wang, Felix Xiao, zhu Lin, and Mengwei Xu. 2023. Efficient Federated Learning for Modern NLP. In MobiCom. ACM, Madrid, Spain, 14 pages."},{"key":"e_1_3_2_2_24_1","volume-title":"Accelerating Large Language Model Decoding with Speculative Sampling. CoRR abs\/2302.01318","author":"Chen Charlie","year":"2023","unstructured":"Charlie Chen, Sebastian Borgeaud, Geoffrey Irving, Jean-Baptiste Lespiau, Laurent Sifre, and John Jumper. 2023. Accelerating Large Language Model Decoding with Speculative Sampling. CoRR abs\/2302.01318 (2023)."},{"key":"e_1_3_2_2_25_1","volume-title":"DC-CCL: Device-Cloud Collaborative Controlled Learning for Large Vision Models. CoRR abs\/2303.10361","author":"Ding Yucheng","year":"2023","unstructured":"Yucheng Ding, Chaoyue Niu, Fan Wu, Shaojie Tang, Chengfei Lyu, and Guihai Chen. 2023. DC-CCL: Device-Cloud Collaborative Controlled Learning for Large Vision Models. CoRR abs\/2303.10361 (2023)."},{"key":"e_1_3_2_2_26_1","volume-title":"Gemini: A Family of Highly Capable Multimodal Models. CoRR abs\/2312.11805","author":"Google Gemini Team","year":"2023","unstructured":"Gemini Team Google. 2023. Gemini: A Family of Highly Capable Multimodal Models. CoRR abs\/2312.11805 (2023)."},{"key":"e_1_3_2_2_27_1","volume-title":"REALM: Retrieval-Augmented Language Model Pre-Training. CoRR abs\/2002.08909","author":"Guu Kelvin","year":"2020","unstructured":"Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Ming-Wei Chang. 2020. REALM: Retrieval-Augmented Language Model Pre-Training. CoRR abs\/2002.08909 (2020)."},{"key":"e_1_3_2_2_28_1","volume-title":"EMNLP (1)","author":"He Junxian","unstructured":"Junxian He, Graham Neubig, and Taylor Berg-Kirkpatrick. 2021. Efficient Nearest Neighbor Language Models. In EMNLP (1). Association for Computational Linguistics, 5703--5714."},{"key":"e_1_3_2_2_29_1","volume-title":"ICML (Proceedings of Machine Learning Research","volume":"2799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin de Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-Efficient Transfer Learning for NLP. In ICML (Proceedings of Machine Learning Research, Vol. 97). PMLR, 2790--2799."},{"key":"e_1_3_2_2_30_1","unstructured":"Edward J. Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In ICLR. OpenReview.net Virtual 13 pages."},{"key":"e_1_3_2_2_31_1","volume-title":"ICML","volume":"119","author":"Karimireddy Sai Praneeth","year":"2020","unstructured":"Sai Praneeth Karimireddy, Satyen Kale, Mehryar Mohri, Sashank J. Reddi, Sebastian U. Stich, and Ananda Theertha Suresh. 2020. SCAFFOLD: Stochastic Controlled Averaging for Federated Learning. In ICML, Vol. 119. PMLR, Virtual, 5132--5143."},{"key":"e_1_3_2_2_32_1","unstructured":"Urvashi Khandelwal Omer Levy Dan Jurafsky Luke Zettlemoyer and Mike Lewis. 2020. Generalization through Memorization: Nearest Neighbor Language Models. In ICLR. OpenReview.net."},{"key":"e_1_3_2_2_33_1","volume-title":"ICML (Proceedings of Machine Learning Research","volume":"19286","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. 2023. Fast Inference from Transformers via Speculative Decoding. In ICML (Proceedings of Machine Learning Research, Vol. 202). PMLR, 19274--19286."},{"key":"e_1_3_2_2_34_1","unstructured":"Patrick S. H. Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel Sebastian Riedel and Douwe Kiela. 2020. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In NeurIPS."},{"key":"e_1_3_2_2_35_1","volume-title":"ACL\/IJCNLP (1)","author":"Li Xiang Lisa","unstructured":"Xiang Lisa Li and Percy Liang. 2021. Prefix-Tuning: Optimizing Continuous Prompts for Generation. In ACL\/IJCNLP (1). Association for Computational Linguistics, 4582--4597."},{"key":"e_1_3_2_2_36_1","volume-title":"Suriya Gunasekar, and Yin Tat Lee.","author":"Li Yuanzhi","year":"2023","unstructured":"Yuanzhi Li, S\u00e9bastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar, and Yin Tat Lee. 2023. Textbooks Are All You Need II: phi-1.5 technical report. CoRR abs\/2309.05463 (2023)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580795"},{"key":"e_1_3_2_2_38_1","volume-title":"Jing Kai Siow, and Yang Liu","author":"Liu Shangqing","year":"2021","unstructured":"Shangqing Liu, Yu Chen, Xiaofei Xie, Jing Kai Siow, and Yang Liu. 2021. Retrieval-Augmented Generation for Code Summarization via Hybrid GNN. In ICLR. Open-Review.net."},{"key":"e_1_3_2_2_39_1","volume-title":"Walle: An End-to-End, General-Purpose, and Large-Scale Production System for Device-Cloud Collaborative Machine Learning. In OSDI","author":"Lv Chengfei","year":"2022","unstructured":"Chengfei Lv, Chaoyue Niu, Renjie Gu, Xiaotang Jiang, Zhaode Wang, Bin Liu, Ziqi Wu, Qiulin Yao, Congyu Huang, Panos Huang, Tao Huang, Hui Shu, Jinde Song, Bin Zou, Peng Lan, Guohuan Xu, Fei Wu, Shaojie Tang, Fan Wu, and Guihai Chen. 2022. Walle: An End-to-End, General-Purpose, and Large-Scale Production System for Device-Cloud Collaborative Machine Learning. In OSDI. USENIX, Carlsbad, CA, USA, 249--265."},{"key":"e_1_3_2_2_40_1","volume-title":"andWeizhu Chen","author":"Mao Yuning","year":"2021","unstructured":"Yuning Mao, Pengcheng He, Xiaodong Liu, Yelong Shen, Jianfeng Gao, Jiawei Han, andWeizhu Chen. 2021. Generation-Augmented Retrieval for Open-Domain Question Answering. In ACL\/IJCNLP (1). Association for Computational Linguistics, 4089--4100."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-012-9338-y"},{"key":"e_1_3_2_2_42_1","unstructured":"Brendan McMahan Eider Moore Daniel Ramage Seth Hampson and Blaise Ag\u00fcera y Arcas. 2017. Communication-Efficient Learning of Deep Networks from Decentralized Data. In AISTATS. JMLR Fort Lauderdale USA 1273--1282."},{"key":"e_1_3_2_2_43_1","volume-title":"Lazier Than Lazy Greedy","author":"Mirzasoleiman Baharan","year":"1812","unstructured":"Baharan Mirzasoleiman, Ashwinkumar Badanidiyuru, Amin Karbasi, Jan Vondr\u00e1k, and Andreas Krause. 2015. Lazier Than Lazy Greedy. In AAAI. AAAI Press, 1812--1818."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610917"},{"key":"e_1_3_2_2_46_1","volume-title":"A Comprehensive Survey of Hallucination Mitigation Techniques in Large Language Models. CoRR abs\/2401.01313","author":"Towhidul Islam Tonmoy S. M.","year":"2024","unstructured":"S. M. Towhidul Islam Tonmoy, S. M. Mehedi Zaman, Vinija Jain, Anku Rani, Vipula Rawte, Aman Chadha, and Amitava Das. 2024. A Comprehensive Survey of Hallucination Mitigation Techniques in Large Language Models. CoRR abs\/2401.01313 (2024)."},{"key":"e_1_3_2_2_47_1","volume-title":"ACL (1)","author":"Wang Dexin","unstructured":"Dexin Wang, Kai Fan, Boxing Chen, and Deyi Xiong. 2022. Efficient Cluster-Based k-Nearest-Neighbor Machine Translation. In ACL (1). Association for Computational Linguistics, 2175--2187."},{"key":"e_1_3_2_2_48_1","volume-title":"Cloud-Device Collaborative Learning for Multimodal Large Language Models. CoRR abs\/2312.16279","author":"Liu Jiaming","year":"2023","unstructured":"GuanqunWang, Jiaming Liu, Chenxuan Li, Junpeng Ma, Yuan Zhang, XinyuWei, Kevin Zhang, Maurice Chong, Ray Zhang, Yijiang Liu, and Shanghang Zhang. 2023. Cloud-Device Collaborative Learning for Multimodal Large Language Models. CoRR abs\/2312.16279 (2023)."},{"key":"e_1_3_2_2_49_1","volume-title":"Offsite-Tuning: Transfer Learning without Full Model. CoRR abs\/2302.04870","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, and Song Han. 2023. Offsite-Tuning: Transfer Learning without Full Model. CoRR abs\/2302.04870 (2023)."},{"key":"e_1_3_2_2_50_1","volume-title":"LLMCad: Fast and Scalable On-device Large Language Model Inference. CoRR abs\/2309.04255","author":"Xu Daliang","year":"2023","unstructured":"Daliang Xu, Wangsong Yin, Xin Jin, Ying Zhang, Shiyun Wei, Mengwei Xu, and Xuanzhe Liu. 2023. LLMCad: Fast and Scalable On-device Large Language Model Inference. CoRR abs\/2309.04255 (2023)."},{"key":"e_1_3_2_2_51_1","volume-title":"EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models. CoRR abs\/2308.14352","author":"Yi Rongjie","year":"2023","unstructured":"Rongjie Yi, Liwei Guo, Shiyun Wei, Ao Zhou, Shangguang Wang, and Mengwei Xu. 2023. EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models. CoRR abs\/2308.14352 (2023)."},{"key":"e_1_3_2_2_52_1","volume-title":"Chan","author":"Zhan Xueying","year":"2021","unstructured":"Xueying Zhan, Huan Liu, Qing Li, and Antoni B. Chan. 2021. A Comparative Survey: Benchmarking for Pool-based Active Learning. In IJCAI. ijcai.org, 4679--4686."},{"key":"e_1_3_2_2_53_1","volume-title":"TinyLlama: An Open-Source Small Language Model. CoRR abs\/2401.02385","author":"Zhang Peiyuan","year":"2024","unstructured":"Peiyuan Zhang, Guangtao Zeng, Tianduo Wang, and Wei Lu. 2024. TinyLlama: An Open-Source Small Language Model. CoRR abs\/2401.02385 (2024)."},{"key":"e_1_3_2_2_54_1","volume-title":"Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer.","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona T. Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. CoRR abs\/2205.01068 (2022)."},{"key":"e_1_3_2_2_55_1","volume-title":"Training Language Models with Memory Augmentation","author":"Zhong Zexuan","unstructured":"Zexuan Zhong, Tao Lei, and Danqi Chen. 2022. Training Language Models with Memory Augmentation. In EMNLP. Association for Computational Linguistics, 5657--5673."},{"key":"e_1_3_2_2_56_1","volume-title":"On Optimal Caching and Model Multiplexing for Large Model Inference. CoRR abs\/2306.02003","author":"Zhu Banghua","year":"2023","unstructured":"Banghua Zhu, Ying Sheng, Lianmin Zheng, Clark W. Barrett, Michael I. Jordan, and Jiantao Jiao. 2023. On Optimal Caching and Model Multiplexing for Large Model Inference. CoRR abs\/2306.02003 (2023)."}],"event":{"name":"KDD '24: The 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Barcelona Spain","acronym":"KDD '24","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671679","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3637528.3671679","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:06:00Z","timestamp":1750291560000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671679"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,24]]},"references-count":55,"alternative-id":["10.1145\/3637528.3671679","10.1145\/3637528"],"URL":"https:\/\/doi.org\/10.1145\/3637528.3671679","relation":{},"subject":[],"published":{"date-parts":[[2024,8,24]]},"assertion":[{"value":"2024-08-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}