{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T13:14:37Z","timestamp":1780060477975,"version":"3.54.0"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T00:00:00Z","timestamp":1781913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,21]]},"DOI":"10.1145\/3745756.3809243","type":"proceedings-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:52:21Z","timestamp":1780059141000},"page":"854-867","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["VLMCache: Efficient On-Device Vision-Language Model Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-7128-6188","authenticated-orcid":false,"given":"Yinyuan","family":"Zhang","sequence":"first","affiliation":[{"name":"Key Laboratory of High Confidence Software Technologies (Peking University), Ministry of Education; School of Computer Science, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6775-0688","authenticated-orcid":false,"given":"Daliang","family":"Xu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology; School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8607-8539","authenticated-orcid":false,"given":"Zhiyang","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2589-6316","authenticated-orcid":false,"given":"Chenghua","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology; School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6924-2319","authenticated-orcid":false,"given":"Ying","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of High Confidence Software Technologies (Peking University), Ministry of Education; School of Computer Science, Peking University, Beijing, China"},{"name":"National Engineering Research Center of Software Engineering, Peking University, Beijing, China"},{"name":"Beijing Tongming Lake Information Technology Application Innovation Center (TLAIC), Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6271-6993","authenticated-orcid":false,"given":"Mengwei","family":"Xu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology; School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4686-3181","authenticated-orcid":false,"given":"Gang","family":"Huang","sequence":"additional","affiliation":[{"name":"Key Laboratory of High Confidence Software Technologies (Peking University), Ministry of Education; School of Computer Science, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3725273"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.2196\/59505"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240765.3240799"},{"key":"e_1_3_2_1_4_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu et al. 2024. Qwen2.5 technical report. arXiv preprint arXiv:2412.15115 (2024)."},{"key":"e_1_3_2_1_5_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_6_1","volume-title":"The revolution of multimodal large language models: a survey. arXiv preprint arXiv:2402.12451","author":"Caffagni Davide","year":"2024","unstructured":"Davide Caffagni, Federico Cocchi, Luca Barsellotti, Nicholas Moratelli, Sara Sarto, Lorenzo Baraldi, Marcella Cornia, and Rita Cucchiara. 2024. The revolution of multimodal large language models: a survey. arXiv preprint arXiv:2402.12451 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3731569.3764808"},{"key":"e_1_3_2_1_8_1","volume-title":"Mobilevlm: A fast, strong and open vision language assistant for mobile devices. arXiv preprint arXiv:2312.16886","author":"Chu Xiangxiang","year":"2023","unstructured":"Xiangxiang Chu, Limeng Qiao, Xinyang Lin, Shuang Xu, Yang Yang, Yiming Hu, Fei Wei, Xinyu Zhang, Bo Zhang, Xiaolin Wei, et al. 2023. Mobilevlm: A fast, strong and open vision language assistant for mobile devices. arXiv preprint arXiv:2312.16886 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy. 2020. An image is worth 16\u00d716 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_10_1","unstructured":"Matthijs Douze Alexandr Guzhva Chengqi Deng Jeff Johnson Gergely Szilvasy Pierre-Emmanuel Mazar\u00e9 Maria Lomeli Lucas Hosseini and Herv\u00e9 J\u00e9gou. 2024. The Faiss library. (2024). arXiv:2401.08281 [cs.LG]"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01151"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01433"},{"key":"e_1_3_2_1_13_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_14_1","first-page":"325","article-title":"Prompt cache: Modular attention reuse for low-latency inference","volume":"6","author":"Gim In","year":"2024","unstructured":"In Gim, Guojun Chen, Seung-seob Lee, Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. 2024. Prompt cache: Modular attention reuse for low-latency inference. Proceedings of Machine Learning and Systems 6 (2024), 325\u2013338.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1155\/2022\/4255220"},{"key":"e_1_3_2_1_16_1","volume-title":"Scaling llm test-time compute with mobile npu on smartphones. arXiv preprint arXiv:2509.23324","author":"Hao Zixu","year":"2025","unstructured":"Zixu Hao, Jianyu Wei, Tuowei Wang, Minxing Huang, Huiqiang Jiang, Shiqi Jiang, Ting Cao, and Ju Ren. 2025. Scaling llm test-time compute with mobile npu on smartphones. arXiv preprint arXiv:2509.23324 (2025)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1305"},{"key":"e_1_3_2_1_18_1","volume-title":"EPIC: Efficient Position-Independent Caching for Serving Large Language Models. arXiv preprint arXiv:2410.15332","author":"Hu Junhao","year":"2024","unstructured":"Junhao Hu, Wenrui Huang, Weidong Wang, Haoyi Wang, Tiancheng Hu, Qin Zhang, Hao Feng, Xusheng Chen, Yizhou Shan, and Tao Xie. 2024. EPIC: Efficient Position-Independent Caching for Serving Large Language Models. arXiv preprint arXiv:2410.15332 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"EPIC: Efficient Position-Independent Caching for Serving Large Language Models. arXiv:2410.15332","author":"Hu Junhao","year":"2025","unstructured":"Junhao Hu, Wenrui Huang, Weidong Wang, Haoyi Wang, Tiancheng Hu, Qin Zhang, Hao Feng, Xusheng Chen, Yizhou Shan, and Tao Xie. 2025. EPIC: Efficient Position-Independent Caching for Serving Large Language Models. arXiv:2410.15332"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.3390\/machines11070677"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3768628"},{"key":"e_1_3_2_1_22_1","volume-title":"KVzip: Query-Agnostic KV Cache Compression with Context Reconstruction. arXiv preprint arXiv:2505.23416","author":"Kim Jang-Hyun","year":"2025","unstructured":"Jang-Hyun Kim, Jinuk Kim, Sangwoo Kwon, Jae W Lee, Sangdoo Yun, and Hyun Oh Song. 2025. KVzip: Query-Agnostic KV Cache Compression with Context Reconstruction. arXiv preprint arXiv:2505.23416 (2025)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_25_1","volume-title":"A survey on large language model acceleration based on kv cache management. arXiv preprint arXiv:2412.19442","author":"Li Haoyang","year":"2024","unstructured":"Haoyang Li, Yiming Li, Anxin Tian, Tianhao Tang, Zhanchao Xu, Xuejia Chen, Nicole Hu, Wei Dong, Qing Li, and Lei Chen. 2024. A survey on large language model acceleration based on kv cache management. arXiv preprint arXiv:2412.19442 (2024)."},{"key":"e_1_3_2_1_26_1","first-page":"92130","article-title":"On the effects of data scale on ui control agents","volume":"37","author":"Li Wei","year":"2024","unstructured":"Wei Li, William E Bishop, Alice Li, Christopher Rawles, Folawiyo Campbell-Ajala, Divya Tyamagundlu, and Oriana Riva. 2024. On the effects of data scale on ui control agents. Advances in Neural Information Processing Systems 37 (2024), 92130\u201392154.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.3389\/fnbot.2024.1513354"},{"key":"e_1_3_2_1_29_1","volume-title":"Mixture-of-transformers: A sparse and scalable architecture for multi-modal foundation models. arXiv preprint arXiv:2411.04996","author":"Liang Weixin","year":"2024","unstructured":"Weixin Liang, Lili Yu, Liang Luo, Srinivasan Iyer, Ning Dong, Chunting Zhou, Gargi Ghosh, Mike Lewis, Wen-tau Yih, Luke Zettlemoyer, et al. 2024. Mixture-of-transformers: A sparse and scalable architecture for multi-modal foundation models. arXiv preprint arXiv:2411.04996 (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2884203"},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of machine learning and systems 6","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. 2024. Awq: Activation-aware weight quantization for on-device llm compression and acceleration. Proceedings of machine learning and systems 6 (2024), 87\u2013100."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672274"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00392"},{"key":"e_1_3_2_1_36_1","unstructured":"Xudong Lu Yinghao Chen Renshou Wu Haohao Gao Xi Chen Xue Yang Xiangyu Zhao Aojun Zhou Fangyuan Li Yafei Wen et al. 2025. GenieBlue: Integrating both Linguistic and Multimodal Capabilities for Large Language Models on Mobile Devices. arXiv preprint arXiv:2503.06019 (2025)."},{"key":"e_1_3_2_1_37_1","unstructured":"MLC team. 2023\u20132025. MLC-LLM. https:\/\/github.com\/mlc-ai\/mlc-llm"},{"key":"e_1_3_2_1_38_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_39_1","volume-title":"Vins-mono: A robust and versatile monocular visual-inertial state estimator","author":"Qin Tong","year":"2018","unstructured":"Tong Qin, Peiliang Li, and Shaojie Shen. 2018. Vins-mono: A robust and versatile monocular visual-inertial state estimator. IEEE transactions on robotics 34, 4 (2018), 1004\u20131020."},{"key":"e_1_3_2_1_40_1","unstructured":"Qualcomm. 2025. QNN SDK. https:\/\/docs.qualcomm.com\/bundle\/publicresource\/topics\/80-63442-50\/introduction.html"},{"key":"e_1_3_2_1_41_1","volume-title":"International Conference on Machine Learning (ICML).","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_42_1","volume-title":"FineGym: A Hierarchical Video Dataset for Fine-grained Action Understanding. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Shao Dian","year":"2020","unstructured":"Dian Shao, Yue Zhao, Bo Dai, and Dahua Lin. 2020. FineGym: A Hierarchical Video Dataset for Fine-grained Action Understanding. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_43_1","volume-title":"Imp: Highly capable large multimodal models for mobile devices","author":"Shao Zhenwei","year":"2025","unstructured":"Zhenwei Shao, Zhou Yu, Jun Yu, Xuecheng Ouyang, Lihao Zheng, Zhenbiao Gai, Mingyang Wang, Zhenzhong Kuang, and Jiajun Ding. 2025. Imp: Highly capable large multimodal models for mobile devices. IEEE Transactions on Multimedia (2025)."},{"key":"e_1_3_2_1_44_1","volume-title":"Keep the cost down: A review on methods to optimize LLM's KV-cache consumption. arXiv preprint arXiv:2407.18003","author":"Shi Luohe","year":"2024","unstructured":"Luohe Shi, Hongyi Zhang, Yao Yao, Zuchao Li, and Hai Zhao. 2024. Keep the cost down: A review on methods to optimize LLM's KV-cache consumption. arXiv preprint arXiv:2407.18003 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","first-page":"104297","DOI":"10.1016\/j.ipm.2025.104297","article-title":"SMR-agents: Synergistic medical reasoning agents for zero-shot medical visual question answering with MLLMs","volume":"63","author":"Wang Dujuan","year":"2026","unstructured":"Dujuan Wang, Tao Cheng, Sutong Wang, Youhua Frank Chen, and Yunqiang Yin. 2026. SMR-agents: Synergistic medical reasoning agents for zero-shot medical visual question answering with MLLMs. Information Processing & Management 63, 1 (2026), 104297.","journal-title":"Information Processing & Management"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01202"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3214306"},{"key":"e_1_3_2_1_49_1","volume-title":"Open-Qwen2VL: Compute-Efficient Pre-Training of Fully-Open Multimodal LLMs on Academic Resources. arXiv preprint arXiv:2504.00595","author":"Wang Weizhi","year":"2025","unstructured":"Weizhi Wang, Yu Tian, Linjie Yang, Heng Wang, and Xifeng Yan. 2025. Open-Qwen2VL: Compute-Efficient Pre-Training of Fully-Open Multimodal LLMs on Academic Resources. arXiv preprint arXiv:2504.00595 (2025)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02766"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.1079"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32945"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.296"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707239"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3241539.3241563"},{"key":"e_1_3_2_1_56_1","volume-title":"Kvlink: Accelerating large language models via efficient kv cache reuse. arXiv preprint arXiv:2502.16002","author":"Yang Jingbo","year":"2025","unstructured":"Jingbo Yang, Bairu Hou, Wei Wei, Yujia Bao, and Shiyu Chang. 2025. Kvlink: Accelerating large language models via efficient kv cache reuse. arXiv preprint arXiv:2502.16002 (2025)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642517"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696098"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-025-61040-5"},{"key":"e_1_3_2_1_60_1","volume-title":"Chunkattention: Efficient self-attention with prefix-aware kv cache and two-phase partition. arXiv preprint arXiv:2402.15220","author":"Ye Lu","year":"2024","unstructured":"Lu Ye, Ze Tao, Yong Huang, and Yang Li. 2024. Chunkattention: Efficient self-attention with prefix-aware kv cache and two-phase partition. arXiv preprint arXiv:2402.15220 (2024)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643832.3661407"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649361"},{"key":"e_1_3_2_1_63_1","volume-title":"Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601","author":"Zhang Duzhen","year":"2024","unstructured":"Duzhen Zhang, Yahan Yu, Jiahua Dong, Chenxing Li, Dan Su, Chenhui Chu, and Dong Yu. 2024. Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601 (2024)."},{"key":"e_1_3_2_1_64_1","volume-title":"Privacy-aware offloading strategy via self-supervised feature mapping in the end-edge-cloud system. ACM Transactions on Sensor Networks","author":"Zhang Rui","year":"2024","unstructured":"Rui Zhang, Xuemei Zhao, Yajing Li, Shipu Zheng, Ruhui Ma, Mengke Tian, Youhua Xue, Yong Wang, and Haibing Guan. 2024. Privacy-aware offloading strategy via self-supervised feature mapping in the end-edge-cloud system. ACM Transactions on Sensor Networks (2024)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447993.3448628"},{"key":"e_1_3_2_1_66_1","volume-title":"MPIC: Position-Independent Multimodal Context Caching System for Efficient MLLM Serving. arXiv:2502.01960","author":"Zhao Shiju","year":"2025","unstructured":"Shiju Zhao, Junhao Hu, Rongxiao Huang, Jiaqi Zheng, and Guihai Chen. 2025. MPIC: Position-Independent Multimodal Context Caching System for Efficient MLLM Serving. arXiv:2502.01960"},{"key":"e_1_3_2_1_67_1","volume-title":"Deep learning classification by ResNet-18 based on the real spectral dataset from multispectral remote sensing images. Remote sensing 14, 19","author":"Zhao Yi","year":"2022","unstructured":"Yi Zhao, Xinchang Zhang, Weiming Feng, and Jianhui Xu. 2022. Deep learning classification by ResNet-18 based on the real spectral dataset from multispectral remote sensing images. Remote sensing 14, 19 (2022), 4883."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02250"},{"key":"e_1_3_2_1_69_1","volume-title":"Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al.","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Chuyue Livia Sun, Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. 2024. Sglang: Efficient execution of structured language model programs. Advances in neural information processing systems 37 (2024), 62557\u201362583."},{"key":"e_1_3_2_1_70_1","volume-title":"Tinyllava: A framework of small-scale large multimodal models. arXiv preprint arXiv:2402.14289","author":"Zhou Baichuan","year":"2024","unstructured":"Baichuan Zhou, Ying Hu, Xi Weng, Junlong Jia, Jie Luo, Xien Liu, Ji Wu, and Lei Huang. 2024. Tinyllava: A framework of small-scale large multimodal models. arXiv preprint arXiv:2402.14289 (2024)."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3688863.3689575"}],"event":{"name":"MobiSys '26: 24th Annual International Conference on Mobile Systems, Applications and Services","location":"University of Cambridge Cambridge United Kingdom","acronym":"MobiSys '26","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 24th Annual International Conference on Mobile Systems, Applications and Services"],"original-title":[],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:53:57Z","timestamp":1780059237000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3745756.3809243"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,20]]},"references-count":71,"alternative-id":["10.1145\/3745756.3809243","10.1145\/3745756"],"URL":"https:\/\/doi.org\/10.1145\/3745756.3809243","relation":{},"subject":[],"published":{"date-parts":[[2026,6,20]]},"assertion":[{"value":"2026-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}