{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:54:17Z","timestamp":1777064057247,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62132014"],"award-info":[{"award-number":["62132014"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Fundamental and Interdisciplinary Disciplines Breakthrough Plan of the Ministry of Education of China","award":["JYB2025XDXM113"],"award-info":[{"award-number":["JYB2025XDXM113"]}]},{"DOI":"10.13039\/501100003816","name":"Huawei Technologies","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003816","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3803572","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"126-143","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["On-device Semantic Selection Made Low Latency and Memory Efficient with Monolithic Forwarding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2023-8723","authenticated-orcid":false,"given":"Jiahao","family":"Zhou","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7315-1883","authenticated-orcid":false,"given":"Chengliang","family":"Lin","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3559-5467","authenticated-orcid":false,"given":"Dingji","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4839-055X","authenticated-orcid":false,"given":"Mingkai","family":"Dong","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9720-0361","authenticated-orcid":false,"given":"Haibo","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2023. ellamind\/wikipedia-2023-11-retrieval-multilingual-corpus \u2022 Datasets at HuggingFace. https:\/\/huggingface.co\/datasets\/ellamind\/wikipedia-2023-11-retrieval-multilingual-corpus"},{"key":"e_1_3_2_1_2_1","unstructured":"2024. BAAI\/bge-reranker-v2-gemma \u2022 HuggingFace. https:\/\/huggingface.co\/BAAI\/bge-reranker-v2-gemma."},{"key":"e_1_3_2_1_3_1","unstructured":"2024. BAAI\/bge-reranker-v2-m3 \u2022 HuggingFace. https:\/\/huggingface.co\/BAAI\/bge-reranker-v2-m3."},{"key":"e_1_3_2_1_4_1","unstructured":"2024. BAAI\/bge-reranker-v2-minicpm-layerwise \u2022 HuggingFace. https:\/\/huggingface.co\/BAAI\/bge-reranker-v2-minicpm-layerwise."},{"key":"e_1_3_2_1_5_1","unstructured":"2025. Global interpreter Lock - Python Wiki. https:\/\/wiki.python.org\/moin\/GlobalInterpreterLock"},{"key":"e_1_3_2_1_6_1","unstructured":"2025. IPADS-SAI\/MobiAgent: The Intelligent GUI agent for mobile Phones. https:\/\/github.com\/IPADS-SAI\/MobiAgent"},{"key":"e_1_3_2_1_7_1","unstructured":"2025. IPADS-SAI\/MobiMind-Decider-7B \u2022 HuggingFace. https:\/\/huggingface.co\/IPADS-SAI\/MobiMind-Decider-7B"},{"key":"e_1_3_2_1_8_1","unstructured":"2025. libuv\/libuv: Cross-platform asynchronous I\/O. https:\/\/github.com\/libuv\/libuv"},{"key":"e_1_3_2_1_9_1","unstructured":"2025. Magic Cue on Pixel 10 Series Phones: smart contextual assistance across apps emails & more. https:\/\/store.google.com\/intl\/en\/ideas\/articles\/magic-cue\/."},{"key":"e_1_3_2_1_10_1","volume-title":"Huawei HarmonyOS Developer. https:\/\/developer.huawei.com\/consumer\/cn\/doc\/harmonyos-guides\/performance-memory-usage [Online","author":"Usage Memory","year":"2025","unstructured":"2025. Memory Usage, Huawei HarmonyOS Developer. https:\/\/developer.huawei.com\/consumer\/cn\/doc\/harmonyos-guides\/performance-memory-usage [Online; accessed 2025-09-18]."},{"key":"e_1_3_2_1_11_1","unstructured":"2025. Qwen\/Qwen3-Reranker-8B \u2022 HuggingFace. https:\/\/huggingface.co\/Qwen\/Qwen3-Reranker-8B"},{"key":"e_1_3_2_1_12_1","unstructured":"2025. Rerank | Boost Enterprise Search and Retrieval | Cohere. https:\/\/cohere.com\/rerank."},{"key":"e_1_3_2_1_13_1","volume-title":"Reranking for Vertex AI RAG Engine | Generative AI on Vertex AI | Google Cloud. https:\/\/cloud.google.com\/vertex-ai\/generative-ai\/docs\/rag-engine\/retrieval-and-ranking [Online","year":"2025","unstructured":"2025. Reranking for Vertex AI RAG Engine | Generative AI on Vertex AI | Google Cloud. https:\/\/cloud.google.com\/vertex-ai\/generative-ai\/docs\/rag-engine\/retrieval-and-ranking [Online; accessed 2025-09-12]."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation","author":"Agrawal Amey","year":"2025","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S. Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2025. Taming throughput-latency tradeoff in LLM inference with sarathi-serve. In Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation (Santa Clara, CA, USA) (OSDI'24). USENIX Association, USA, Article 7, 18 pages."},{"key":"e_1_3_2_1_15_1","unstructured":"Avinashsingh. 2025. [Feature]: Add support for Apple MPS(Metal Performance Shaders). https:\/\/github.com\/vllm-project\/vllm\/issues\/22629"},{"key":"e_1_3_2_1_16_1","volume-title":"LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-context Multitasks. arXiv preprint arXiv:2412.15204","author":"Bai Yushi","year":"2024","unstructured":"Yushi Bai, Shangqing Tu, Jiajie Zhang, Hao Peng, Xiaozhi Wang, Xin Lv, Shulin Cao, Jiazheng Xu, Lei Hou, Yuxiao Dong, Jie Tang, and Juanzi Li. 2024. LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-context Multitasks. arXiv preprint arXiv:2412.15204 (2024)."},{"key":"e_1_3_2_1_17_1","unstructured":"Beir-Cellar. 2025. beir-cellar\/beir: A Heterogeneous Benchmark for Information Retrieval. Easy to use evaluate your models across 15+ diverse IR datasets. https:\/\/github.com\/beir-cellar\/beir"},{"key":"e_1_3_2_1_18_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. https:\/\/arxiv.org\/abs\/2005.14165. arXiv:2005.14165 [cs.CL]"},{"key":"e_1_3_2_1_19_1","volume-title":"EfficientQAT: Efficient Quantization-Aware Training for Large Language Models. arXiv preprint arXiv:2407.11062","author":"Chen Mengzhao","year":"2024","unstructured":"Mengzhao Chen, Wenqi Shao, Peng Xu, Jiahao Wang, Peng Gao, Kaipeng Zhang, Yu Qiao, and Ping Luo. 2024. EfficientQAT: Efficient Quantization-Aware Training for Large Language Models. arXiv preprint arXiv:2407.11062 (2024)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Prateek Chhikara Dev Khant Saket Aryan Taranjeet Singh and Deshraj Yadav. 2025. Mem0: Building Production-Ready AI Agents with Scalable Long-Term Memory. arXiv:2504.19413 [cs.CL] https:\/\/arxiv.org\/abs\/2504.19413","DOI":"10.3233\/FAIA251160"},{"key":"e_1_3_2_1_21_1","unstructured":"Cohere. 2024. Introducing Rerank 3 on Microsoft Azure AI | Cohere blog. https:\/\/cohere.com\/blog\/introducing-rerank-3-on-microsoft-azure-ai."},{"key":"e_1_3_2_1_22_1","unstructured":"Wikipedia contributors. 2022. Goodman and Kruskal's gamma. https:\/\/en.wikipedia.org\/wiki\/Goodman_and_Kruskal%27s_gamma"},{"key":"e_1_3_2_1_23_1","unstructured":"Wikipedia contributors. 2025. Coefficient of variation. https:\/\/en.wikipedia.org\/wiki\/Coefficient_of_variation"},{"key":"e_1_3_2_1_24_1","unstructured":"Wikipedia contributors. 2025. Global interpreter lock. https:\/\/en.wikipedia.org\/wiki\/Global_interpreter_lock"},{"key":"e_1_3_2_1_25_1","unstructured":"Mike Darling. 2025. 4 ways Pixel's Magic Cue can help you save time. https:\/\/blog.google\/products\/pixel\/google-pixel-magic-cue-ai-feature\/."},{"key":"e_1_3_2_1_26_1","unstructured":"Gabriel de Souza P. Moreira Ronay Ak Benedikt Schifferer Mengyao Xu Radek Osmulski and Even Oldridge. 2024. Enhancing Q&A Text Retrieval with Ranking Models: Benchmarking fine-tuning and deploying Rerankers for RAG. arXiv:2409.07691 [cs.IR] https:\/\/arxiv.org\/abs\/2409.07691"},{"key":"e_1_3_2_1_27_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. https:\/\/arxiv.org\/abs\/1810.04805. arXiv:1810.04805 [cs.CL]","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. https:\/\/arxiv.org\/abs\/1810.04805. arXiv:1810.04805 [cs.CL]"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles (Lotte Hotel World","author":"Du Kuntai","year":"2025","unstructured":"Kuntai Du, Bowen Wang, Chen Zhang, Yiming Cheng, Qing Lan, Hejian Sang, Yihua Cheng, Jiayi Yao, Xiaoxuan Liu, Yifan Qiao, Ion Stoica, and Junchen Jiang. 2025. PrefillOnly: An Inference Engine for Prefill-only Workloads in Large Language Model Applications. In Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles (Lotte Hotel World, Seoul, Republic of Korea) (SOSP '25). Association for Computing Machinery, New York, NY, USA, 399\u2013414. 10.1145\/3731569.3764834"},{"key":"e_1_3_2_1_29_1","volume-title":"GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. https:\/\/arxiv.org\/abs\/2210.17323. arXiv:2210.17323 [cs.LG]","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2023. GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. https:\/\/arxiv.org\/abs\/2210.17323. arXiv:2210.17323 [cs.LG]"},{"key":"e_1_3_2_1_30_1","unstructured":"Qichen Fu Minsik Cho Thomas Merth Sachin Mehta Mohammad Rastegari and Mahyar Najibi.2024. LazyLLM: Dynamic Token Pruning for Efficient Long Context LLM Inference. arXiv:2407.14057 [cs.CL] https:\/\/arxiv.org\/abs\/2407.14057"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.194"},{"key":"e_1_3_2_1_32_1","first-page":"100","article-title":"Algorithm AS 136: A k-means clustering algorithm","volume":"28","author":"Hartigan John A","year":"1979","unstructured":"John A Hartigan and Manchek A Wong. 1979. Algorithm AS 136: A k-means clustering algorithm. Journal of the royal statistical society. series c (applied statistics) 28, 1 (1979), 100\u2013108.","journal-title":"Journal of the royal statistical society. series c (applied statistics)"},{"key":"e_1_3_2_1_33_1","unstructured":"HuggingFace. 2025. Accelerate: A simple way to launch train and use PyTorch models on almost any device and distributed configuration automatic mixed precision (including fp8) and easy-to-configure FSDP and DeepSpeed support. https:\/\/github.com\/huggingface\/accelerate"},{"key":"e_1_3_2_1_34_1","unstructured":"HuggingFace. 2025. Transformers: the model-definition framework for state-of-the-art machine learning models in text vision audio and multimodal models for both inference and training. https:\/\/github.com\/huggingface\/transformers"},{"key":"e_1_3_2_1_35_1","first-page":"19","volume-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Anna Korhonen, David Traum, and Llu\u00eds M\u00e0rquez (Eds.). Association for Computational Linguistics","author":"Jawahar Ganesh","year":"2019","unstructured":"Ganesh Jawahar, Beno\u00eet Sagot, and Djam\u00e9 Seddah. 2019. What Does BERT Learn about the Structure of Language?. In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Anna Korhonen, David Traum, and Llu\u00eds M\u00e0rquez (Eds.). Association for Computational Linguistics, Florence, Italy, 3651\u20133657. 10.18653\/v1\/P19-1356"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.91"},{"key":"e_1_3_2_1_37_1","unstructured":"Jiazheng Kang Mingming Ji Zhe Zhao and Ting Bai. 2025. Memory OS of AI Agent. arXiv:2506.06326 [cs.AI] https:\/\/arxiv.org\/abs\/2506.06326"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"e_1_3_2_1_39_1","unstructured":"Khan-Yin. 2025. [Feature] Support Apple Silicon (M2\/M3...). https:\/\/github.com\/sgl-project\/sglang\/issues\/5767"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval. 39\u201348","author":"Khattab Omar","year":"2020","unstructured":"Omar Khattab and Matei Zaharia. 2020. ColBERT: Efficient and Effective Passage Search via Contextualized Late Interaction over BERT. In Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval. 39\u201348. 10.1145\/3397271.3401075"},{"key":"e_1_3_2_1_41_1","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph Gonzalez, Hao Zhang, and Ion Stoica. 2023. Efficient Memory Management for Large Language Model Serving with PagedAttention. In Proceedings of the 29th Symposium on Operating Systems Principles (Koblenz, Germany) (SOSP '23). Association for Computing Machinery, New York, NY, USA, 611\u2013626. 10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_42_1","unstructured":"Xunhao Lai Jianqiao Lu Yao Luo Yiyuan Ma and Xun Zhou. 2025. FlexPrefill: A Context-Aware Sparse Attention Mechanism for Efficient Long-Sequence Inference. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=OfjMllbelrT"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2020. Retrieval-augmented generation for knowledge-intensive NLP tasks. In Proceedings of the 34th International Conference on Neural Information Processing Systems (Vancouver, BC, Canada) (NIPS '20). Curran Associates Inc., Red Hook, NY, USA, Article 793, 16 pages."},{"key":"e_1_3_2_1_44_1","unstructured":"Jiaxing Li Chi Xu Lianchen Jia Feng Wang Cong Zhang and Jiangchuan Liu. 2025. EACO-RAG: Towards Distributed Tiered LLM Deployment using Edge-Assisted and Collaborative RAG with Adaptive Knowledge Update. arXiv:2410.20299 [cs.DC] https:\/\/arxiv.org\/abs\/2410.20299"},{"key":"e_1_3_2_1_45_1","volume-title":"AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. https:\/\/arxiv.org\/abs\/2306.00978. arXiv:2306.00978 [cs.CL]","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. 2024. AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. https:\/\/arxiv.org\/abs\/2306.00978. arXiv:2306.00978 [cs.CL]"},{"key":"e_1_3_2_1_46_1","unstructured":"Lingkun Long Rubing Yang Yushi Huang Desheng Hui Ao Zhou and Jianlei Yang. 2025. SlimInfer: Accelerating Long-Context LLM Inference via Dynamic Token Pruning. arXiv:2508.06447 [cs.CL] https:\/\/arxiv.org\/abs\/2508.06447"},{"key":"e_1_3_2_1_47_1","volume-title":"LLM-Pruner: On the Structural Pruning of Large Language Models. In Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=J8Ajf9WfXP","author":"Ma Xinyin","year":"2023","unstructured":"Xinyin Ma, Gongfan Fang, and Xinchao Wang. 2023. LLM-Pruner: On the Structural Pruning of Large Language Models. In Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=J8Ajf9WfXP"},{"key":"e_1_3_2_1_48_1","unstructured":"Milvus. 2024. Milvus | The High-Performance Vector Database built for Scale. https:\/\/milvus.io\/."},{"key":"e_1_3_2_1_49_1","unstructured":"NVIDIA. 2025. NVIDIA Multi-Process Service. https:\/\/docs.nvidia.com\/deploy\/mps\/index.html"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Authors of BitNet b1.58. 2025. BitNet b1.58: 1.58-bit Large Language Models. arXiv:2504.12285 [cs.CL] https:\/\/arxiv.org\/abs\/2504.12285","DOI":"10.1017\/S0261444825100839"},{"key":"e_1_3_2_1_51_1","unstructured":"PyTorch. 2025. Multiprocessing package - torch.multiprocessing. https:\/\/docs.pytorch.org\/docs\/2.8\/multiprocessing.html"},{"key":"e_1_3_2_1_52_1","unstructured":"Yuwei Ren Yuhui Ding Lijun Wu Shujian Huang Lei Li and Qun Liu. 2024. BitNet a4.8: 1-bit Weight 4-bit Activation LLMs. arXiv:2411.04965 [cs.CL] https:\/\/arxiv.org\/abs\/2411.04965"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.225"},{"key":"e_1_3_2_1_54_1","unstructured":"Victor Sanh Lysandre Debut Julien Chaumond and Thomas Wolf. 2019. DistilBERT a distilled version of BERT: smaller faster cheaper and lighter. arXiv:1910.01108 [cs.CL] https:\/\/arxiv.org\/abs\/1910.01108"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.272"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Yixin Song Zeyu Mi Haotong Xie and Haibo Chen. 2024. PowerInfer: Fast Large Language Model Serving with a Consumer-grade GPU. arXiv:2312.12456 [cs.LG] https:\/\/arxiv.org\/abs\/2312.12456","DOI":"10.1145\/3694715.3695964"},{"key":"e_1_3_2_1_57_1","unstructured":"Nandan Thakur Nils Reimers Andreas R\u00fcckl\u00e9 Abhishek Srivastava and Iryna Gurevych. 2021. BEIR: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2). https:\/\/openreview.net\/forum?id=wCu6T5xFjeJ"},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Proceedings of the 31st International Conference on Neural Information Processing Systems (Long Beach, California, USA) (NIPS'17). Curran Associates Inc., Red Hook, NY, USA, 6000\u20136010."},{"key":"e_1_3_2_1_59_1","volume-title":"Proceedings of the 2021 International Conference on Management of Data","author":"Wang Jianguo","year":"2021","unstructured":"Jianguo Wang, Xiaomeng Yi, Rentong Guo, Hai Jin, Peng Xu, Shengjun Li, Xiangyu Wang, Xiangzhou Guo, Chengming Li, Xiaohai Xu, Kun Yu, Yuxing Yuan, Yinghao Zou, Jiquan Long, Yudong Cai, Zhenxiang Li, Zhifeng Zhang, Yihua Mo, Jun Gu, Ruiyi Jiang, Yi Wei, and Charles Xie. 2021. Milvus: A Purpose-Built Vector Data Management System. In Proceedings of the 2021 International Conference on Management of Data (Virtual Event, China) (SIGMOD '21). Association for Computing Machinery, New York, NY, USA, 2614\u20132627. 10.1145\/3448016.3457550"},{"key":"e_1_3_2_1_60_1","volume-title":"Gonzalez","author":"Wang Yichuan","year":"2025","unstructured":"Yichuan Wang, Shu Liu, Zhifei Li, Yongji Wu, Ziming Mao, Yilong Zhao, Xiao Yan, Zhiying Xu, Yang Zhou, Ion Stoica, Sewon Min, Matei Zaharia, and Joseph E. Gonzalez. 2025. LEANN: A Low-Storage Vector Index. arXiv:2506.08276 [cs.DB] https:\/\/arxiv.org\/abs\/2506.08276"},{"key":"e_1_3_2_1_61_1","volume-title":"Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval (Washington DC, USA) (SIGIR '24)","author":"Zijie","unstructured":"Zijie J. Wang and Duen Horng Chau. 2024. MeMemo: On-device Retrieval Augmentation for Private and Personalized Text Generation. In Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval (Washington DC, USA) (SIGIR '24). Association for Computing Machinery, New York, NY, USA, 2765\u20132770. 10.1145\/3626772.3657662"},{"key":"e_1_3_2_1_62_1","volume-title":"Findings of the Association for Computational Linguistics: NAACL 2025","author":"Wang Zora Zhiruo","year":"2025","unstructured":"Zora Zhiruo Wang, Akari Asai, Xinyan Velocity Yu, Frank F. Xu, Yiqing Xie, Graham Neubig, and Daniel Fried. 2025. CodeRAG-Bench: Can Retrieval Augment Code Generation?. In Findings of the Association for Computational Linguistics: NAACL 2025, Luis Chiruzzo, Alan Ritter, and Lu Wang (Eds.). Association for Computational Linguistics, Albuquerque, New Mexico, 3199\u20133214. 10.18653\/v1\/2025.findings-naacl.176"},{"key":"e_1_3_2_1_63_1","volume-title":"Agent Workflow Memory. In Forty-second International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=NTAhi2JEEE","author":"Wang Zora Zhiruo","year":"2025","unstructured":"Zora Zhiruo Wang, Jiayuan Mao, Daniel Fried, and Graham Neubig. 2025. Agent Workflow Memory. In Forty-second International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=NTAhi2JEEE"},{"key":"e_1_3_2_1_64_1","unstructured":"Orion Weller Michael Boratko Iftekhar Naim and Jinhyuk Lee. 2025. On the Theoretical Limitations of Embedding-Based Retrieval. arXiv:2508.21038 [cs.IR] https:\/\/arxiv.org\/abs\/2508.21038"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-industry.11"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"crossref","unstructured":"Ji Xin Raphael Tang Jaejun Lee Yaoliang Yu and Jimmy Lin. 2020. DeeBERT: Dynamic Early Exiting for Accelerating BERT Inference. arXiv:2004.12993 [cs.CL] https:\/\/arxiv.org\/abs\/2004.12993","DOI":"10.18653\/v1\/2020.acl-main.204"},{"key":"e_1_3_2_1_67_1","unstructured":"Zhenliang Xue Yixin Song Zeyu Mi Xinrui Zheng Yubin Xia and Haibo Chen. 2024. PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv:2406.06282 [cs.LG] https:\/\/arxiv.org\/abs\/2406.06282"},{"key":"e_1_3_2_1_68_1","unstructured":"Cheng Zhang Erhu Feng Xi Zhao Yisheng Zhao Wangbo Gong Jiahui Sun Dong Du Zhichao Hua Yubin Xia and Haibo Chen. 2025. MobiAgent: A Systematic Framework for Customizable Mobile Agents. arXiv:2509.00531 [cs.MA] https:\/\/arxiv.org\/abs\/2509.00531"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2412.03131"},{"key":"e_1_3_2_1_70_1","unstructured":"Yanzhao Zhang Mingxin Li Dingkun Long Xin Zhang Huan Lin Baosong Yang Pengjun Xie An Yang Dayiheng Liu Junyang Lin Fei Huang and Jingren Zhou. 2025. Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models. https:\/\/arxiv.org\/abs\/2506.05176. arXiv:2506.05176 [cs.CL]"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637870"},{"key":"e_1_3_2_1_72_1","unstructured":"George Kingsley Zipf. 1949. Human behavior and the principle of least effort. (1949)."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"deposited":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:28:54Z","timestamp":1777062534000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3803572"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":72,"alternative-id":["10.1145\/3767295.3803572","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3803572","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}