{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,10]],"date-time":"2025-10-10T01:24:14Z","timestamp":1760059454298,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,12]]},"DOI":"10.1145\/3725783.3764393","type":"proceedings-article","created":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T17:50:12Z","timestamp":1760032212000},"page":"46-53","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["TARDIS: A GPU-Centric KV Cache Service for Efficient LLM Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-6501-5659","authenticated-orcid":false,"given":"Yifan","family":"Hu","sequence":"first","affiliation":[{"name":"Xiamen University, Xiamen, Fujian, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0821-0646","authenticated-orcid":false,"given":"Shi","family":"Qiu","sequence":"additional","affiliation":[{"name":"Xiamen University, Xiamen, Fujian, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4279-9395","authenticated-orcid":false,"given":"Jianqin","family":"Yan","sequence":"additional","affiliation":[{"name":"Xiamen University, Xiamen, Fujian, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7141-5903","authenticated-orcid":false,"given":"Hao","family":"Chen","sequence":"additional","affiliation":[{"name":"Xiamen University, Xiamen, Fujian, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5200-777X","authenticated-orcid":false,"given":"Xintao","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2923-6247","authenticated-orcid":false,"given":"Tang","family":"Lu","sequence":"additional","affiliation":[{"name":"Xiamen University, Xiamen, Fujian, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1617-3593","authenticated-orcid":false,"given":"Guangtao","family":"Xue","sequence":"additional","affiliation":[{"name":"SJTU, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6450-8485","authenticated-orcid":false,"given":"Yiming","family":"Zhang","sequence":"additional","affiliation":[{"name":"XMU &amp; SJTU, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,11]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Mnemosyne: Parallelization strategies for efficiently serving multi-million context length llm inference requests without approximations. arXiv preprint arXiv:2409.17264","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Junda Chen, \u00cd\u00f1igo Goiri, Ramachandran Ramjee, Chaojie Zhang, Alexey Tumanov, and Esha Choukse. 2024. Mnemosyne: Parallelization strategies for efficiently serving multi-million context length llm inference requests without approximations. arXiv preprint arXiv:2409.17264 (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"3","author":"Chang Chia-Hao","year":"2024","unstructured":"Chia-Hao Chang, Jihoon Han, Anand Sivasubramaniam, Vikram Sharma Mailthody, Zaid Qureshi, and Wen-Mei Hwu. 2024. GMT: GPU Orchestrated Memory Tiering for the Big Data Era. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3. 464\u2013478."},{"key":"e_1_3_2_1_4_1","volume-title":"23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Chen Menglei","year":"2025","unstructured":"Menglei Chen, Yu Hua, Zhangyu Chen, Ming Zhang, and Gen Dong. 2025. {GPHash}: An Efficient Hash Index for {GPU} with {Byte-Granularity} Persistent Memory. In 23rd USENIX Conference on File and Storage Technologies (FAST 25). 203\u2013220."},{"key":"e_1_3_2_1_5_1","unstructured":"datacrunch.io. 2025. DeepSeek-V3 + SGLang: Inference Optimization. https:\/\/datacrunch.io\/blog\/deepseek-v3-sglang-inference-optimization."},{"key":"e_1_3_2_1_6_1","unstructured":"Deepseek. 2025. Models and Pricing. https:\/\/api-docs.deepseek.com\/quick_start\/pricing\/."},{"key":"e_1_3_2_1_7_1","volume-title":"Boosting Performance of Iterative Applications on GPUs: Kernel Batching with CUDA Graphs. arXiv preprint arXiv:2501.09398","author":"Ekelund Jonah","year":"2025","unstructured":"Jonah Ekelund, Stefano Markidis, and Ivy Peng. 2025. Boosting Performance of Iterative Applications on GPUs: Kernel Batching with CUDA Graphs. arXiv preprint arXiv:2501.09398 (2025)."},{"key":"e_1_3_2_1_8_1","volume-title":"Attention-Store: Cost-effective Attention Reuse across Multi-turn Conversations in Large Language Model Serving. arXiv preprint arXiv:2403.19708","author":"Gao Bin","year":"2024","unstructured":"Bin Gao, Zhuomin He, Puru Sharma, Qingxuan Kang, Djordje Jevdjic, Junbo Deng, Xingkun Yang, Zhou Yu, and Pengfei Zuo. 2024. Attention-Store: Cost-effective Attention Reuse across Multi-turn Conversations in Large Language Model Serving. arXiv preprint arXiv:2403.19708 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Gao Bin","year":"2024","unstructured":"Bin Gao, Zhuomin He, Puru Sharma, Qingxuan Kang, Djordje Jevdjic, Junbo Deng, Xingkun Yang, Zhou Yu, and Pengfei Zuo. 2024. {Cost-Efficient} large language model serving for multi-turn conversations with {CachedAttention}. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). 111\u2013126."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696072"},{"key":"e_1_3_2_1_11_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_12_1","first-page":"1270","article-title":"Kvquant: Towards 10 million context length llm inference with kv cache quantization","volume":"37","author":"Hooper Coleman","year":"2024","unstructured":"Coleman Hooper, Sehoon Kim, Hiva Mohammadzadeh, Michael W Mahoney, Sophia Shao, Kurt Keutzer, and Amir Gholami. 2024. Kvquant: Towards 10 million context length llm inference with kv cache quantization. Advances in Neural Information Processing Systems 37 (2024), 1270\u20131303.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_15_1","unstructured":"Aixin Liu Bei Feng Bin Wang Bingxuan Wang Bo Liu Chenggang Zhao Chengqi Dengr Chong Ruan Damai Dai Daya Guo et al. 2024. Deepseek-v2: A strong economical and efficient mixture-of-experts language model. arXiv preprint arXiv:2405.04434 (2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672274"},{"key":"e_1_3_2_1_17_1","unstructured":"Nvidia. 2024. NVIDIA GPUDirect Storage. https:\/\/docs.nvidia.com\/gpudirect-storage\/index.html."},{"key":"e_1_3_2_1_18_1","unstructured":"NVIDIA. 2025. Creating a Graph Using Graph APIs. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html."},{"key":"e_1_3_2_1_19_1","unstructured":"NVIDIA. 2025. NVIDIA Dynamo. https:\/\/github.com\/ai-dynamo\/dynamo\/."},{"key":"e_1_3_2_1_20_1","volume-title":"Mooncake: Kimi's KVCache-centric Architecture for LLM Serving. arXiv preprint arXiv:2407.00079","author":"Qin Ruoyu","year":"2024","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2024. Mooncake: Kimi's KVCache-centric Architecture for LLM Serving. arXiv preprint arXiv:2407.00079 (2024)."},{"key":"e_1_3_2_1_21_1","volume-title":"GeminiFS: A Companion File System for GPUs. In 23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Qiu Shi","year":"2025","unstructured":"Shi Qiu, Weinan Liu, Yifan Hu, Jianqin Yan, Zhirong Shen, Xin Yao, Renhai Chen, Gong Zhang, and Yiming Zhang. 2025. GeminiFS: A Companion File System for GPUs. In 23rd USENIX Conference on File and Storage Technologies (FAST 25). USENIX Association, Santa Clara, CA, 221\u2013236. https:\/\/www.usenix.org\/conference\/fast25\/presentation\/qiu"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575748"},{"key":"e_1_3_2_1_23_1","unstructured":"SGLang. 2024. SGLang. https:\/\/github.com\/sgl-project\/sglang?tab=readme-ov-file."},{"key":"e_1_3_2_1_24_1","unstructured":"solidigm. 2024. Solidigm D7-PS1010. https:\/\/www.solidigm.com\/products\/data-center\/d7\/ps1010.html."},{"key":"e_1_3_2_1_25_1","volume-title":"2023 International Conference on Emerging Research in Computational Science (ICERCS). IEEE, 1\u20136.","author":"Thakkar Hiren","year":"2023","unstructured":"Hiren Thakkar and A Manimaran. 2023. Comprehensive Examination of Instruction-Based Language Models: A Comparative Analysis of Mistral-7B and Llama-2-7B. In 2023 International Conference on Emerging Research in Computational Science (ICERCS). IEEE, 1\u20136."},{"key":"e_1_3_2_1_26_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_27_1","unstructured":"vllm. 2025. vLLM V1 - Default max CUDA graph size. https:\/\/discuss.vllm.ai\/t\/vllm-v1-default-max-cuda-graph-size\/357\/."},{"key":"e_1_3_2_1_28_1","unstructured":"vllm. 2025. vLLM V1: A Major Upgrade to vLLM's Core Architecture. https:\/\/blog.vllm.ai\/2025\/01\/27\/v1-alpha-release.html."},{"key":"e_1_3_2_1_29_1","unstructured":"vllm. 2025. vLLM's torch.compile integration. https:\/\/github.com\/vllm-project\/vllm\/blob\/main\/docs\/source\/design\/v1\/torch_compile.md\/."},{"key":"e_1_3_2_1_30_1","unstructured":"Devavret Makkar Vukasin Milovanovic and Gregory Kimball. 2022. Boosting Data Ingest Throughput with GPUDirect Storage and RAPIDS cuDF. https:\/\/developer.nvidia.com\/zh-cn\/blog\/boosting-data-ingest-throughput-with-gpudirect-storage-and-rapids-cudf\/."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696098"},{"key":"e_1_3_2_1_32_1","volume-title":"A survey on recent advances in llm-based multi-turn dialogue systems. arXiv preprint arXiv:2402.18013","author":"Yi Zihao","year":"2024","unstructured":"Zihao Yi, Jiarui Ouyang, Yuwen Liu, Tianhao Liao, Zhe Xu, and Ying Shen. 2024. A survey on recent advances in llm-based multi-turn dialogue systems. arXiv preprint arXiv:2402.18013 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"Yi: Open foundation models by 01. ai. arXiv preprint arXiv:2403.04652","author":"Young Alex","year":"2024","unstructured":"Alex Young, Bei Chen, Chao Li, Chengen Huang, Ge Zhang, Guanwei Zhang, Guoyin Wang, Heng Li, Jiangcheng Zhu, Jianqun Chen, et al. 2024. Yi: Open foundation models by 01. ai. arXiv preprint arXiv:2403.04652 (2024)."},{"key":"e_1_3_2_1_34_1","unstructured":"Tao Yuan Xuefei Ning Dong Zhou Zhijie Yang Shiyao Li Minghui Zhuang Zheyue Tan Zhuyu Yao Dahua Lin Boxun Li Guohao Dai Shengen Yan and Yu Wang. 2024. LV-Eval: A Balanced Long-Context Benchmark with 5 Length Levels Up to 256K. arXiv:2402.05136 [cs.CL]"}],"event":{"name":"APSys '25: 16th ACM SIGOPS Asia-Pacific Workshop on Systems","location":"Lotte Hotel World, Emerald Hall Seoul Republic of Korea","acronym":"APSys '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 16th ACM SIGOPS Asia-Pacific Workshop on Systems"],"original-title":[],"deposited":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T17:51:42Z","timestamp":1760032302000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725783.3764393"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,11]]},"references-count":34,"alternative-id":["10.1145\/3725783.3764393","10.1145\/3725783"],"URL":"https:\/\/doi.org\/10.1145\/3725783.3764393","relation":{},"subject":[],"published":{"date-parts":[[2025,10,11]]},"assertion":[{"value":"2025-10-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}