{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T17:06:55Z","timestamp":1775840815978,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","funder":[{"name":"National Key Research and Development Program of China Grant","award":["2023YFF0725100"],"award-info":[{"award-number":["2023YFF0725100"]}]},{"name":"NSFC under Grant","award":["U22B2060"],"award-info":[{"award-number":["U22B2060"]}]},{"name":"Guangdong-Hong Kong Technology Innovation Joint Funding Scheme Project","award":["2024A0505040012"],"award-info":[{"award-number":["2024A0505040012"]}]},{"name":"the Hong Kong RGC GRF Project","award":["16213620"],"award-info":[{"award-number":["16213620"]}]},{"name":"RIF Project","award":["R6020-19"],"award-info":[{"award-number":["R6020-19"]}]},{"name":"AOE Project","award":["AoE\/E-603\/18"],"award-info":[{"award-number":["AoE\/E-603\/18"]}]},{"name":"Theme-based project","award":["TRS T41-603\/20R"],"award-info":[{"award-number":["TRS T41-603\/20R"]}]},{"name":"CRF Project","award":["C2004-21G"],"award-info":[{"award-number":["C2004-21G"]}]},{"name":"Guangdong Province Science and Technology Plan Project","award":["2023A0505030011"],"award-info":[{"award-number":["2023A0505030011"]}]},{"name":"Guangzhou municipality big data intelligence key lab","award":["2023A03J0012"],"award-info":[{"award-number":["2023A03J0012"]}]},{"name":"Hong Kong ITC ITF grants","award":["MHX\/078\/21 and PRP\/004\/22FX"],"award-info":[{"award-number":["MHX\/078\/21 and PRP\/004\/22FX"]}]},{"name":"Zhujiang scholar program","award":["2021JC02X170"],"award-info":[{"award-number":["2021JC02X170"]}]},{"name":"Guangdong Basic and Applied Basic Research 
Foundation","award":["2025A1515010304"],"award-info":[{"award-number":["2025A1515010304"]}]},{"name":"Guangzhou Science and Technology Planning Project","award":["2025A03J4491"],"award-info":[{"award-number":["2025A03J4491"]}]},{"name":"Yunnan Key Research Program","award":["202402AD080004"],"award-info":[{"award-number":["202402AD080004"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3774904.3792794","type":"proceedings-article","created":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T21:54:39Z","timestamp":1775771679000},"page":"8127-8137","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Probe-and-Fetch: Dynamic KV Cache Pruning for Accelerated Long-Context Inference in Web-Scale AI Search"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3869-7881","authenticated-orcid":false,"given":"Yuchen","family":"Li","sequence":"first","affiliation":[{"name":"Baidu Inc., Beijing, China and Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2889-2266","authenticated-orcid":false,"given":"Rui","family":"Kong","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9071-1933","authenticated-orcid":false,"given":"Xinran","family":"Chen","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4830-7144","authenticated-orcid":false,"given":"Chengzhe","family":"Zhang","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6674-7764","authenticated-orcid":false,"given":"Jiamin","family":"Chen","sequence":"additional","affiliation":[{"name":"City University of Hong Kong, Hong Kong, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3171-823X","authenticated-orcid":false,"given":"Cheng","family":"Deng","sequence":"additional","affiliation":[{"name":"University of Edinburgh, Edinburgh, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5511-9370","authenticated-orcid":false,"given":"Xinyu","family":"Ma","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4662-1212","authenticated-orcid":false,"given":"Haojie","family":"Zhang","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9910-2298","authenticated-orcid":false,"given":"Tianhao","family":"Peng","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7147-5666","authenticated-orcid":false,"given":"Hengyi","family":"Cai","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9212-1947","authenticated-orcid":false,"given":"Shuaiqiang","family":"Wang","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9770-7616","authenticated-orcid":false,"given":"Jiashu","family":"Zhao","sequence":"additional","affiliation":[{"name":"Wilfrid Laurier University, Waterloo, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2085-7418","authenticated-orcid":false,"given":"Yongqi","family":"Zhang","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5451-3253","authenticated-orcid":false,"given":"Haoyi","family":"Xiong","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1292-1491","authenticated-orcid":false,"given":"Jimmy 
Xiangji","family":"Huang","sequence":"additional","affiliation":[{"name":"York University, Toronto, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8257-5806","authenticated-orcid":false,"given":"Lei","family":"Chen","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4021-4228","authenticated-orcid":false,"given":"Jun","family":"Wang","sequence":"additional","affiliation":[{"name":"University College London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0684-6205","authenticated-orcid":false,"given":"Dawei","family":"Yin","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2026,4,12]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Sandhini Agarwal Lama Ahmad Jason Ai Sam Altman Andy Applebaum Edwin Arbus Rahul K Arora Yu Bai Bowen Baker Haiming Bao et al. 2025. Gpt-Oss-120b & Gpt-Oss-20b Model Card. ArXiv preprint arXiv:2508.10925 (2025)."},{"key":"e_1_3_2_1_2_1","volume-title":"Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads. In International Conference on Machine Learning (ICML).","author":"Cai Tianle","year":"2024","unstructured":"Tianle Cai, Yuhong Li, Zhengyang Geng, Hongwu Peng, Jason D. Lee, Deming Chen, and Tri Dao. 2024. Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_3_1","volume-title":"Accelerating Large Language Model Decoding with Speculative Sampling. ArXiv preprint arXiv:2302.01318","author":"Chen Charlie","year":"2023","unstructured":"Charlie Chen, Sebastian Borgeaud, Geoffrey Irving, Jean-Baptiste Lespiau, Laurent Sifre, and John Jumper. 2023. Accelerating Large Language Model Decoding with Speculative Sampling. 
ArXiv preprint arXiv:2302.01318 (2023)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3737249"},{"key":"e_1_3_2_1_5_1","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Chen Zhuoming","year":"2024","unstructured":"Zhuoming Chen, Avner May, Ruslan Svirschevski, Yuhsun Huang, Max Ryabinin, Zhihao Jia, and Beidi Chen. 2024a. Sequoia: Scalable, Robust, and Hardware-Aware Speculative Decoding. In Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_6_1","volume-title":"Cascade Speculative Drafting for Even Faster LLM Inference. In Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Chen Ziyi","year":"2024","unstructured":"Ziyi Chen, Xiaocong Yang, Jiacheng Lin, Chenkai Sun, Kevin Chen-Chuan Chang, and Jie Huang. 2024b. Cascade Speculative Drafting for Even Faster LLM Inference. In Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_7_1","volume-title":"Layer Skip: Enabling Early Exit Inference and Self-Speculative Decoding. In Annual Meeting of the Association for Computational Linguistics (ACL).","author":"Elhoushi Mostafa","year":"2024","unstructured":"Mostafa Elhoushi, Akshat Shrivastava, Diana Liskovich, Basil Hosmer, Bram Wasti, Liangzhen Lai, Anas Mahmoud, Bilge Acun, Saurabh Agarwal, Ahmed Roman, et al., 2024. Layer Skip: Enabling Early Exit Inference and Self-Speculative Decoding. In Annual Meeting of the Association for Computational Linguistics (ACL)."},{"key":"e_1_3_2_1_8_1","volume-title":"Break the Sequential Dependency of LLM Inference Using Lookahead Decoding. In International Conference on Machine Learning (ICML).","author":"Fu Yichao","year":"2024","unstructured":"Yichao Fu, Peter Bailis, Ion Stoica, and Hao Zhang. 2024. Break the Sequential Dependency of LLM Inference Using Lookahead Decoding. 
In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Ge Suyu","year":"2024","unstructured":"Suyu Ge, Yunan Zhang, Liyuan Liu, Minjia Zhang, Jiawei Han, and Jianfeng Gao. 2024. Model Tells You What to Discard: Adaptive KV Cache Compression for LLMs. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_10_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The Llama 3 Herd of Models. ArXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"InfiniGen: Efficient Generative Inference of Large Language Models with Dynamic KV Cache Management. In USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Lee Wonbeom","year":"2024","unstructured":"Wonbeom Lee, Jungi Lee, Junghwan Seo, and Jaewoong Sim. 2024. InfiniGen: Efficient Generative Inference of Large Language Models with Dynamic KV Cache Management. In USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_12_1","volume-title":"International Conference on Machine Learning (ICML).","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. 2023. Fast Inference from Transformers via Speculative Decoding. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_13_1","volume-title":"Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al., 2020. 
Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_14_1","unstructured":"Yuchen Li Hengyi Cai Rui Kong Xinran Chen Jiamin Chen Jun Yang Haojie Zhang Jiayi Li Jiayi Wu Yiqun Chen et al. 2025a. Towards AI Search Paradigm. ArXiv preprint arXiv:2506.17188 (2025)."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 32nd ACM SIGKDD Conference on Knowledge Discovery and Data Mining V. 1. 1-11","author":"Li Yuchen","year":"2026","unstructured":"Yuchen Li, Jiamin Chen, Xinran Chen, Zhiyu Li, Haojie Zhang, Rui Kong, Jiayi Li, Xinyu Ma, Hengyi Cai, Lixin Su, Shuaiqiang Wang, Jiashu Zhao, Yongqi Zhang, Haoyi Xiong, Linghe Kong, Lei Chen, and Dawei Yin. 2026 a. Retain to Refine: Agent-Based Adaptive Question Answering via Query Routing and Long-Short Memory. In Proceedings of the 32nd ACM SIGKDD Conference on Knowledge Discovery and Data Mining V. 1. 1-11."},{"key":"e_1_3_2_1_16_1","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Li Yuhong","year":"2024","unstructured":"Yuhong Li, Yingbing Huang, Bowen Yang, Bharat Venkitesh, Acyr Locatelli, Hanchen Ye, Tianle Cai, Patrick Lewis, and Deming Chen. 2024a. SnapKV: LLM Knows What You are Looking for Before Generation. In Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_17_1","volume-title":"FlexSpec: Frozen Drafts Meet Evolving Targets in Edge-Cloud Collaborative LLM Speculative Decoding. arXiv preprint arXiv:2601","author":"Li Yuchen","year":"2026","unstructured":"Yuchen Li, Rui Kong, Zhonghao Lyu, Qiyang Li, Xinran Chen, Hengyi Cai, Lingyong Yan, Shuaiqiang Wang, Jiashu Zhao, Guangxu Zhu, et al., 2026 b. FlexSpec: Frozen Drafts Meet Evolving Targets in Edge-Cloud Collaborative LLM Speculative Decoding. 
arXiv preprint arXiv:2601.00644 (2026)."},{"key":"e_1_3_2_1_18_1","volume-title":"S3PRank: Towards Satisfaction-oriented Learning to Rank with Semi-supervised Pre-training","author":"Li Yuchen","year":"2025","unstructured":"Yuchen Li, Zhonghao Lyu, Yongqi Zhang, Hao Zhang, Tianhao Peng, Haoyi Xiong, Shuaiqiang Wang, Linghe Kong, Guihai Chen, and Dawei Yin. 2025b. S3PRank: Towards Satisfaction-oriented Learning to Rank with Semi-supervised Pre-training. IEEE Transactions on Knowledge and Data Engineering (2025)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.422"},{"key":"e_1_3_2_1_20_1","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Li Yuhui","year":"2025","unstructured":"Yuhui Li, Fangyun Wei, Chao Zhang, and Hongyang Zhang. 2025c. EAGLE-3: Scaling up Inference Acceleration of Large Language Models via Training-Time Test. In Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSC.2023.3325302"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2023.3270750"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3737443"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3737258"},{"key":"e_1_3_2_1_25_1","volume-title":"2025 IEEE 41st International Conference on Data Engineering (ICDE). IEEE, 4441-4454","author":"Li Yuchen","year":"2025","unstructured":"Yuchen Li, Hao Zhang, Yongqi Zhang, Xinyu Ma, Wenwen Ye, Naifei Song, Shuaiqiang Wang, Haoyi Xiong, Dawei Yin, and Lei Chen. 2025 f. M 2 oERank: Multi-Objective Mixture-of-Experts Enhanced Ranking for Satisfaction-Oriented Web Search. In 2025 IEEE 41st International Conference on Data Engineering (ICDE). 
IEEE, 4441-4454."},{"key":"e_1_3_2_1_26_1","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Aditya Desai, Fangshuo Liao, Weitao Wang, Victor Xie, Zhaozhuo Xu, Anastasios Kyrillidis, and Anshumali Shrivastava. 2023. Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression at Test Time. In Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_27_1","volume-title":"Alan Zhu, Lijie Yang, Xiaoxiang Shi, et al.","author":"Miao Xupeng","year":"2024","unstructured":"Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Zeyu Wang, Zhengxin Zhang, Rae Ying Yee Wong, Alan Zhu, Lijie Yang, Xiaoxiang Shi, et al., 2024. Specinfer: Accelerating Large Language Model Serving with Tree-Based Speculative Inference and Verification. In ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)."},{"key":"e_1_3_2_1_28_1","volume-title":"Dynamic Memory Compression: Retrofitting LLMs for Accelerated Inference. In International Conference on Machine Learning (ICML).","author":"Nawrot Piotr","unstructured":"Piotr Nawrot, Adrian \u0141a\u0144cucki, Marcin Chochowski, David Tarjan, and Edoardo M. Ponti. 2024. Dynamic Memory Compression: Retrofitting LLMs for Accelerated Inference. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_29_1","volume-title":"Blockwise Parallel Decoding for Deep Autoregressive Models. In Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Stern Mitchell","year":"2018","unstructured":"Mitchell Stern, Noam Shazeer, and Jakob Uszkoreit. 2018. Blockwise Parallel Decoding for Deep Autoregressive Models. 
In Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_30_1","volume-title":"Triforce: Lossless Acceleration of Long Sequence Generation with Hierarchical Speculative Decoding. In Conference on Language Modeling (COLM).","author":"Sun Hanshi","year":"2024","unstructured":"Hanshi Sun, Zhuoming Chen, Xinyu Yang, Yuandong Tian, and Beidi Chen. 2024a. Triforce: Lossless Acceleration of Long Sequence Generation with Hierarchical Speculative Decoding. In Conference on Language Modeling (COLM)."},{"key":"e_1_3_2_1_31_1","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Sun Ziteng","year":"2024","unstructured":"Ziteng Sun, Ananda Theertha Suresh, Jae Hun Ro, Ahmad Beirami, Himanshu Jain, and Felix Yu. 2024b. Spectr: Fast Speculative Decoding via Optimal Transport. In Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_32_1","volume-title":"QUEST: Query-Aware Sparsity for Efficient Long-Context LLM Inference. In International Conference on Machine Learning (ICML).","author":"Tang Jiaming","year":"2024","unstructured":"Jiaming Tang, Yilong Zhao, Kan Zhu, Guangxuan Xiao, Baris Kasikci, and Song Han. 2024. QUEST: Query-Aware Sparsity for Efficient Long-Context LLM Inference. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_33_1","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All You Need. In Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_34_1","volume-title":"Efficient Streaming Language Models with Attention Sinks. 
In International Conference on Learning Representations (ICLR).","author":"Xiao Guangxuan","year":"2024","unstructured":"Guangxuan Xiao, Yuandong Tian, Beidi Chen, Song Han, and Mike Lewis. 2024. Efficient Streaming Language Models with Attention Sinks. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_35_1","volume-title":"When search engine services meet large language models: visions and challenges","author":"Xiong Haoyi","year":"2024","unstructured":"Haoyi Xiong, Jiang Bian, Yuchen Li, Xuhong Li, Mengnan Du, Shuaiqiang Wang, Dawei Yin, and Sumi Helal. 2024. When search engine services meet large language models: visions and challenges. IEEE Transactions on Services Computing (2024)."},{"key":"e_1_3_2_1_36_1","unstructured":"Peng Xu Wei Ping Xianchao Wu Zihan Liu Mohammad Shoeybi and Bryan Catanzaro. 2024. ChatQA 2: Bridging the Gap to Proprietary LLMs in Long Context and RAG Capabilities. ArXiv preprint arXiv:2407.14482."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.607"},{"key":"e_1_3_2_1_38_1","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Zhang Zhenyu","year":"2023","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher Re, Clark Barrett, Zhangyang Wang, and Beidi Chen. 2023. H2O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models. 
In Annual Conference on Neural Information Processing Systems (NeurIPS)."}],"event":{"name":"WWW '26: The ACM Web Conference 2026","location":"Dubai United Arab Emirates","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 2026"],"original-title":[],"deposited":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T16:19:34Z","timestamp":1775837974000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774904.3792794"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,12]]},"references-count":38,"alternative-id":["10.1145\/3774904.3792794","10.1145\/3774904"],"URL":"https:\/\/doi.org\/10.1145\/3774904.3792794","relation":{},"subject":[],"published":{"date-parts":[[2026,4,12]]},"assertion":[{"value":"2026-04-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}