{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T17:20:38Z","timestamp":1774718438494,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T00:00:00Z","timestamp":1717372800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,3]]},"DOI":"10.1145\/3662006.3662067","type":"proceedings-article","created":{"date-parts":[[2024,6,11]],"date-time":"2024-06-11T12:23:36Z","timestamp":1718108616000},"page":"36-41","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":40,"title":["Hybrid SLM and LLM for Edge-Cloud Collaborative Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-1671-1367","authenticated-orcid":false,"given":"Zixu","family":"Hao","sequence":"first","affiliation":[{"name":"Tsinghua University, Microsoft Research"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1327-4882","authenticated-orcid":false,"given":"Huiqiang","family":"Jiang","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4685-9633","authenticated-orcid":false,"given":"Shiqi","family":"Jiang","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2782-183X","authenticated-orcid":false,"given":"Ju","family":"Ren","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9107-013X","authenticated-orcid":false,"given":"Ting","family":"Cao","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]}],"member":"320","published-online":{"date-parts":[[2024,6,11]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3510831"},{"key":"e_1_3_2_1_2_1","volume-title":"Medusa: Simple 11m inference acceleration framework with multiple decoding heads. arXiv preprint arXiv:2401.10774","author":"Cai Tianle","year":"2024","unstructured":"Tianle Cai, Yuhong Li, Zhengyang Geng, Hongwu Peng, Jason D Lee, Deming Chen, and Tri Dao. 2024. Medusa: Simple 11m inference acceleration framework with multiple decoding heads. arXiv preprint arXiv:2401.10774 (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:2302.01318","author":"Chen Charlie","year":"2023","unstructured":"Charlie Chen, Sebastian Borgeaud, Geoffrey Irving, Jean-Baptiste Lespiau, Laurent Sifre, and John Jumper. 2023. Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:2302.01318 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al.","author":"Chen Mark","year":"2021","unstructured":"Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. 2021. Evaluating large language models trained on code. arXiv preprint arXiv:2107.033 74 (2021)."},{"key":"e_1_3_2_1_5_1","volume-title":"Netgpt: A native-ai network architecture beyond provisioning personalized generative services. arXiv preprint arXiv:2307.06148","author":"Chen Yuxuan","year":"2023","unstructured":"Yuxuan Chen, Rongpeng Li, Zhifeng Zhao, Chenghui Peng, Jianjun Wu, Ekram Hossain, and Honggang Zhang. 2023. Netgpt: A native-ai network architecture beyond provisioning personalized generative services. arXiv preprint arXiv:2307.06148 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Sequoia: Scalable, Robust, and Hardware-aware Speculative Decoding. arXiv preprint arXiv:2402.12374","author":"Chen Zhuoming","year":"2024","unstructured":"Zhuoming Chen, Avner May, Ruslan Svirschevski, Yuhsun Huang, Max Ryabinin, Zhihao Jia, and Beidi Chen. 2024. Sequoia: Scalable, Robust, and Hardware-aware Speculative Decoding. arXiv preprint arXiv:2402.12374 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"Mobilevlm: A fast, reproducible and strong vision language assistant for mobile devices. arXiv preprint arXiv:2312.16886","author":"Chu Xiangxiang","year":"2023","unstructured":"Xiangxiang Chu, Limeng Qiao, Xinyang Lin, Shuang Xu, Yang Yang, Yiming Hu, Fei Wei, Xinyu Zhang, Bo Zhang, Xiaolin Wei, et al. 2023. Mobilevlm: A fast, reproducible and strong vision language assistant for mobile devices. arXiv preprint arXiv:2312.16886 (2023)."},{"key":"e_1_3_2_1_8_1","unstructured":"Xiangxiang Chu Limeng Qiao Xinyu Zhang Shuang Xu Fei Wei Yang Yang Xiaofei Sun Yiming Hu Xinyang Lin Bo Zhang et al. 2024. MobileVLM V2: Faster and Stronger Baseline for Vision Language Model. arXiv preprint arXiv:2402.03766 (2024)."},{"key":"e_1_3_2_1_9_1","unstructured":"Karl Cobbe Vineet Kosaraju Mohammad Bavarian Mark Chen Heewoo Jun Lukasz Kaiser Matthias Plappert Jerry Tworek Jacob Hilton Reiichiro Nakano et al. 2021. Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168 (2021)."},{"key":"e_1_3_2_1_10_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3560905.3568511"},{"key":"e_1_3_2_1_12_1","volume-title":"Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, et al.","author":"Gunasekar Suriya","year":"2023","unstructured":"Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio C\u00e9sar Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, et al. 2023. Textbooks are all you need. arXiv preprint arXiv:2306.11644 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Weizhu Chen, Allie Del Giorno, Ronen Eldan, Sivakanth Gopi, et al.","author":"Javaheripi Mojan","year":"2023","unstructured":"Mojan Javaheripi, S\u00e9bastien Bubeck, Marah Abdin, Jyoti Aneja, Sebastien Bubeck, Caio C\u00e9sar Teodoro Mendes, Weizhu Chen, Allie Del Giorno, Ronen Eldan, Sivakanth Gopi, et al. 2023. Phi-2: The surprising power of small language models. Microsoft Research Blog (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"Speculative decoding with big little decoder. Advances in Neural Information Processing Systems 36","author":"Kim Sehoon","year":"2024","unstructured":"Sehoon Kim, Karttikeya Mangalam, Suhong Moon, Jitendra Malik, Michael W Mahoney, Amir Gholami, and Kurt Keutzer. 2024. Speculative decoding with big little decoder. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372224.3419194"},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. 2023. Fast inference from transformers via speculative decoding. In International Conference on Machine Learning. PMLR, 19274--19286."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TWC.2019.2946140"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586176"},{"key":"e_1_3_2_1_19_1","volume-title":"Suriya Gunasekar, and Yin Tat Lee.","author":"Li Yuanzhi","year":"2023","unstructured":"Yuanzhi Li, S\u00e9bastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar, and Yin Tat Lee. 2023. Textbooks are all you need ii: phi-1.5 technical report. arXiv preprint arXiv:2309.05463 (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00638"},{"key":"e_1_3_2_1_21_1","volume-title":"Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia.","author":"Miao Xupeng","year":"2023","unstructured":"Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. 2023. Specinfer: Accelerating generative llm serving with speculative inference and token tree verification. arXiv preprint arXiv:2305.09781 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1206"},{"key":"e_1_3_2_1_23_1","volume-title":"Consistent Accelerated Inference via Confident Adaptive Transformers. CoRR abs\/2104.08803","author":"Schuster Tal","year":"2021","unstructured":"Tal Schuster, Adam Fisch, Tommi S. Jaakkola, and Regina Barzilay. 2021. Consistent Accelerated Inference via Confident Adaptive Transformers. CoRR abs\/2104.08803 (2021). arXiv:2104.08803 https:\/\/arxiv.org\/abs\/2104.08803"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCWorkshops49005.2020.9145068"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/MCOM.001.2000373"},{"key":"e_1_3_2_1_26_1","volume-title":"Accelerating llm inference with staged speculative decoding. arXiv preprint arXiv:2308.04623","author":"Spector Benjamin","year":"2023","unstructured":"Benjamin Spector and Chris Re. 2023. Accelerating llm inference with staged speculative decoding. arXiv preprint arXiv:2308.04623 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"Juliette Love, et al.","author":"Team Gemma","year":"2024","unstructured":"Gemma Team, Thomas Mesnard, Cassidy Hardin, Robert Dadashi, Surya Bhupatiraju, Shreya Pathak, Laurent Sifre, Morgane Rivi\u00e8re, Mihir Sanjay Kale, Juliette Love, et al. 2024. Gemma: Open models based on gemini research and technology. arXiv preprint arXiv:2403.08295 (2024)."},{"key":"e_1_3_2_1_28_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Chain-of-Thought Reasoning Without Prompting. arXiv preprint arXiv:2402.10200","author":"Wang Xuezhi","year":"2024","unstructured":"Xuezhi Wang and Denny Zhou. 2024. Chain-of-Thought Reasoning Without Prompting. arXiv preprint arXiv:2402.10200 (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587438"},{"key":"e_1_3_2_1_31_1","volume-title":"Llmcad: Fast and scalable on-device large language model inference. arXiv preprint arXiv:2309.04255","author":"Xu Daliang","year":"2023","unstructured":"Daliang Xu, Wangsong Yin, Xin Jin, Ying Zhang, Shiyun Wei, Mengwei Xu, and Xuanzhe Liu. 2023. Llmcad: Fast and scalable on-device large language model inference. arXiv preprint arXiv:2309.04255 (2023)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3032443"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGCN.2021.3111731"},{"key":"e_1_3_2_1_34_1","volume-title":"Tinyllama: An open-source small language model. arXiv preprint arXiv:2401.02385","author":"Zhang Peiyuan","year":"2024","unstructured":"Peiyuan Zhang, Guangtao Zeng, Tianduo Wang, and Wei Lu. 2024. Tinyllama: An open-source small language model. arXiv preprint arXiv:2401.02385 (2024)."}],"event":{"name":"MOBISYS '24: The 22nd Annual International Conference on Mobile Systems, Applications and Services","location":"Minato-ku Tokyo Japan","acronym":"MOBISYS '24","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Workshop on Edge and Mobile Foundation Models"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3662006.3662067","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3662006.3662067","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T20:18:10Z","timestamp":1755980290000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3662006.3662067"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,3]]},"references-count":34,"alternative-id":["10.1145\/3662006.3662067","10.1145\/3662006"],"URL":"https:\/\/doi.org\/10.1145\/3662006.3662067","relation":{},"subject":[],"published":{"date-parts":[[2024,6,3]]},"assertion":[{"value":"2024-06-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}