{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T11:32:33Z","timestamp":1777462353539,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":78,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3805621.3807623","type":"proceedings-article","created":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:08:45Z","timestamp":1777381725000},"page":"286-295","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["A Case for a Simulation-Driven Exploration of Distributed GenAI Platforms"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3586-7168","authenticated-orcid":false,"given":"Animesh","family":"Trivedi","sequence":"first","affiliation":[{"name":"IBM Research, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8089-866X","authenticated-orcid":false,"given":"Radu","family":"Stoica","sequence":"additional","affiliation":[{"name":"IBM Research, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6084-4251","authenticated-orcid":false,"given":"Jeremy","family":"Cohn","sequence":"additional","affiliation":[{"name":"IBM Research, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0614-6543","authenticated-orcid":false,"given":"Danny","family":"Harnik","sequence":"additional","affiliation":[{"name":"IBM Research, Haifa, Israel"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2207-1904","authenticated-orcid":false,"given":"Yue","family":"Zhu","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9694-5063","authenticated-orcid":false,"given":"Jonathan","family":"Terner","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3013-6689","authenticated-orcid":false,"given":"Guy","family":"Margalit","sequence":"additional","affiliation":[{"name":"IBM Storage, Givatayim, Israel"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5691-0284","authenticated-orcid":false,"given":"Frank","family":"Schmuck","sequence":"additional","affiliation":[{"name":"IBM Research, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1424-9977","authenticated-orcid":false,"given":"Vasily","family":"Tarasov","sequence":"additional","affiliation":[{"name":"IBM Research, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4468-3061","authenticated-orcid":false,"given":"Swaminathan","family":"Sundararaman","sequence":"additional","affiliation":[{"name":"IBM Research, San Jose, USA"}]}],"member":"320","published-online":{"date-parts":[[2026,4,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2026. Understand LLM Batch Inference Basics. https:\/\/docs.anyscale. com\/llm\/batch-inference\/llm-batch-inference-basics. Anyscale Documentation. Accessed: 2026-02-16."},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of Machine Learning and Systems (MLSys)","volume":"6","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Jayashree Mohan, Ashish Panwar, Nipun Kwatra, Bhargav S. Gulavani, Ramachandran Ramjee, and Alexey Tumanov. 2024. Vidur: A Large-Scale Simulation Framework for LLM Inference. In Proceedings of Machine Learning and Systems (MLSys), Vol. 6. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2024\/hash\/b74a8de47d2b3c928360e0a011f48351-Abstract-Conference.html"},{"key":"e_1_3_2_1_3_1","unstructured":"Alimama AI Infra Team and Future Living Lab Alibaba Group. 2025. InferSim: A Lightweight LLM Inference Performance Simulator. https:\/\/github.com\/alibaba\/InferSim."},{"key":"e_1_3_2_1_4_1","volume-title":"Publication date","author":"Services Amazon Web","year":"2024","unstructured":"Amazon Web Services. 2024. 
AWS Well-Architected Framework. Amazon Web Services. https:\/\/docs.aws.amazon.com\/wellarchitected\/latest\/framework\/welcome.html Documentation, Publication date: November 6, 2024."},{"key":"e_1_3_2_1_5_1","unstructured":"Anthropic Engineering Team. 2025. Effective context engineering for AI agents. https:\/\/www.anthropic.com\/engineering\/effective-context-engineering-for-ai-agents. Engineering blog post."},{"key":"e_1_3_2_1_6_1","unstructured":"Beam Cloud. 2025. Serverless GPUs for AI Inference and Training. Blog post on Beam Cloud. https:\/\/www.beam.cloud\/blog\/serverless-gpu Accessed: 2026-02-23."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.14778\/3773749.3773758"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3704262"},{"key":"e_1_3_2_1_9_1","unstructured":"Audrey Cheng Shu Liu Melissa Pan Zhifei Li Shubham Agarwal Mert Cemri Bowen Wang Alexander Krentsel Tian Xia Jongseok Park Shuo Yang Jeff Chen Lakshya Agrawal Ashwin Naren Shulu Li Ruiying Ma Aditya Desai Jiarong Xing Koushik Sen Matei Zaharia and Ion Stoica. 2025. Let the Barbarians In: How AI Can Accelerate Systems Performance Research. arXiv:2512.14806 [cs.SE] https:\/\/arxiv.org\/abs\/2512.14806"},{"key":"e_1_3_2_1_10_1","unstructured":"Audrey Cheng Shu Liu Melissa Pan Zhifei Li Bowen Wang Alex Krentsel Tian Xia Mert Cemri Jongseok Park Shuo Yang Jeff Chen Lakshya Agrawal Aditya Desai Jiarong Xing Koushik Sen Matei Zaharia and Ion Stoica. 2025. Barbarians at the Gate: How AI is Upending Systems Research. 
arXiv:2510.06189 [cs.AI] https:\/\/arxiv.org\/abs\/2510.06189"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2025.3628325"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC63097.2024.00012"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3731569.3764834"},{"key":"e_1_3_2_1_14_1","unstructured":"Amr Elmeleegy Harry Kim David Zier Kyle Kranen Neelay Shah Ryan Olson and Omri Kahalon. 2025. NVIDIA Dynamo A Low-Latency Distributed Inference Framework for Scaling Reasoning AI Models. NVIDIA Technical Blog. Available online: https:\/\/developer.nvidia.com\/blog\/introducing-nvidia-dynamo-a-low-latency-distributed-inference-framework-for-scaling-reasoning-ai-models\/."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI '24)","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. 2024. ServerlessLLM: Low-Latency Serverless Inference for Large Language Models. In Proceedings of the 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI '24). USENIX Association, Santa Clara, CA, USA, 135\u2013153. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/fu"},{"key":"e_1_3_2_1_16_1","volume-title":"Cost-Efficient Large Language Model Serving for Multi-turn Conversations with CachedAttention. In 2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Gao Bin","year":"2024","unstructured":"Bin Gao, Zhuomin He, Puru Sharma, Qingxuan Kang, Djordje Jevdjic, Junbo Deng, Xingkun Yang, Zhou Yu, and Pengfei Zuo. 2024. Cost-Efficient Large Language Model Serving for Multi-turn Conversations with CachedAttention. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). USENIX Association, Santa Clara, CA, 111\u2013126. 
https:\/\/www.usenix.org\/conference\/atc24\/presentation\/gao-bin-cost"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3725394"},{"key":"e_1_3_2_1_18_1","volume-title":"Last updated","author":"Cloud Google","year":"2026","unstructured":"Google Cloud. 2026. About the Well-Architected Framework. Google LLC. https:\/\/docs.cloud.google.com\/docs\/get-started\/well-architected-framework Google Cloud Well-Architected Framework documentation, Last updated February 23, 2026."},{"key":"e_1_3_2_1_19_1","volume-title":"Accelerating Design Space Exploration for LLM Training Systems with Multi-experiment Parallel Simulation. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Gui Fei","year":"2025","unstructured":"Fei Gui, Kaihui Gao, Li Chen, Dan Li, Vincent Liu, Ran Zhang, Hongbing Yang, and Dian Xiong. 2025. Accelerating Design Space Exploration for LLM Training Systems with Multi-experiment Parallel Simulation. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25). USENIX Association, Philadelphia, PA, 473\u2013488. https:\/\/www.usenix.org\/conference\/nsdi25\/presentation\/gui"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the 38th International Conference on Neural Information Processing Systems","author":"He Yefei","year":"2024","unstructured":"Yefei He, Luoming Zhang, Weijia Wu, Jing Liu, Hong Zhou, and Bohan Zhuang. 2024. ZipCache: accurate and efficient KV cache quantization with salient token identification. In Proceedings of the 38th International Conference on Neural Information Processing Systems (Vancouver, BC, Canada) (NIPS '24). Curran Associates Inc., Red Hook, NY, USA, Article 2181, 21 pages."},{"key":"e_1_3_2_1_21_1","volume-title":"EPIC: Efficient Position-Independent Caching for Serving Large Language Models. In Forty-second International Conference on Machine Learning. 
https:\/\/openreview.net\/forum?id=qjd3ZUiHRT","author":"Hu Junhao","year":"2025","unstructured":"Junhao Hu, Wenrui Huang, Weidong Wang, Haoyi Wang, tiancheng hu, zhang qin, Hao Feng, Xusheng Chen, Yizhou Shan, and Tao Xie. 2025. EPIC: Efficient Position-Independent Caching for Serving Large Language Models. In Forty-second International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=qjd3ZUiHRT"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the 2025 USENIX Annual Technical Conference (USENIX ATC'25). USENIX Association","author":"Hu Junhao","year":"2025","unstructured":"Junhao Hu, Jiang Xu, Zhixia Liu, Yulong He, Yuetao Chen, Hao Xu, Jiang Liu, Jie Meng, Baoquan Zhang, Shining Wan, Gengyuan Dan, Zhiyu Dong, Zhihao Ren, Changhong Liu, Tao Xie, Dayun Lin, Qin Zhang, Yue Yu, Hao Feng, Xusheng Chen, and Yizhou Shan. 2025. DEEPSERVE: Serverless Large Language Model Serving at Scale. In Proceedings of the 2025 USENIX Annual Technical Conference (USENIX ATC'25). USENIX Association, Boston, MA, USA, 57\u201372. https:\/\/www.usenix.org\/conference\/atc25\/presentation\/hu-junhao"},{"key":"e_1_3_2_1_23_1","unstructured":"inference-sim contributors. 2026. Blackbox Inference Simulator (BLIS). https:\/\/github.com\/inference-sim\/inference-sim. GitHub repository."},{"key":"e_1_3_2_1_24_1","unstructured":"InfraWhisperer. 2026. RouteSim: An Inference Load Balancer Simulator & Benchmarking Framework. https:\/\/github.com\/InfraWhisperer\/RouteSim. GitHub repository accessed: 2026-02-17."},{"key":"e_1_3_2_1_25_1","volume-title":"ACM Sigmetrics","author":"Jain Kunal","year":"2026","unstructured":"Kunal Jain, A. Parayil, Ankur Mallick, Rujia Wang, Renee St. Amant, Chetan Bansal, Victor Ruehle, Saravan Rajmohan, Shashwat Jaiswal, Yogesh Simmhan, Anoop Kulkarni, and Steve Kofsky. 2026. Serving Models, Fast and Slow: Optimizing Heterogeneous LLM Inferencing Workloads at Scale. In ACM Sigmetrics 2026. 
https:\/\/www.microsoft.com\/en-us\/research\/publication\/serving-models-fast-and-slowoptimizing-heterogeneous-llm-inferencing-workloads-at-scale\/"},{"key":"e_1_3_2_1_26_1","unstructured":"Inho Jeong Sunghyeon Woo Sol Namkung and Dongsuk Jeon. 2025. HiFC: High-efficiency Flash-based KV Cache Swapping for Scaling LLM Inference. In The Thirty-ninth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=onhjdWCxZY"},{"key":"e_1_3_2_1_27_1","volume-title":"Forty-second International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=WOyOtaO6lQ","author":"Jin Shuowei","year":"2025","unstructured":"Shuowei Jin, Xueshen Liu, Qingzhao Zhang, and Zhuoqing Mao. 2025. Compute or Load KV Cache? Why Not Both?. In Forty-second International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=WOyOtaO6lQ"},{"key":"e_1_3_2_1_28_1","unstructured":"Huzaifa Shaaban Kabakibo Animesh Trivedi and Lin Wang. 2026. Breaking the Ice: Analyzing Cold Start Latency in vLLM. In (to appear) in the Ninth Conference on Machine Learning and Systems (MLSys'26)."},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the 15th Annual Conference on Innovative Data Systems Research (CIDR '25)","author":"Kayali Moe","year":"2025","unstructured":"Moe Kayali, Fabian Wenz, Nesime Tatbul, and \u00c7a\u011fatay Demiralp. 2025. Mind the Data Gap: Bridging LLMs to Enterprise Data Integration. In Proceedings of the 15th Annual Conference on Innovative Data Systems Research (CIDR '25). Amsterdam, Netherlands. https:\/\/vldb.org\/cidrdb\/papers\/2025\/p34-kayali.pdf Available at https:\/\/arxiv.org\/abs\/2412.20331."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2025.3596539"},{"key":"e_1_3_2_1_31_1","volume-title":"A Survey on Large Language Model Acceleration based on KV Cache Management. 
Transactions on Machine Learning Research","author":"Yiming Li Haoyang LI","year":"2025","unstructured":"Haoyang LI, Yiming Li, Anxin Tian, Tianhao Tang, Zhanchao Xu, Xuejia Chen, Nicole HU, Wei Dong, Li Qing, and Lei Chen. 2025. A Survey on Large Language Model Acceleration based on KV Cache Management. Transactions on Machine Learning Research (2025). https:\/\/openreview.net\/forum?id=z3JZzu9EA3"},{"key":"e_1_3_2_1_32_1","volume-title":"Continuum: Efficient and Robust Multi-Turn LLM Agent Scheduling with KV Cache Time-to-Live. arXiv:2511.02230 [cs.OS] https:\/\/arxiv.org\/abs\/2511.02230","author":"Li Hanchen","year":"2026","unstructured":"Hanchen Li, Qiuyang Mang, Runyuan He, Qizheng Zhang, Huanzhi Mao, Xiaokun Chen, Hangrui Zhou, Alvin Cheung, Joseph Gonzalez, and Ion Stoica. 2026. Continuum: Efficient and Robust Multi-Turn LLM Agent Scheduling with KV Cache Time-to-Live. arXiv:2511.02230 [cs.OS] https:\/\/arxiv.org\/abs\/2511.02230"},{"key":"e_1_3_2_1_33_1","unstructured":"Rongzhi Li Ruogu Du Zefang Chu Sida Zhao Chunlei Han Zuocheng Shi Yiwen Shao Huanle Han Long Huang Zherui Liu and Shufan Liu. 2025. Taming the Chaos: Coordinated Autoscaling for Heterogeneous and Disaggregated LLM Inference. arXiv:2508.19559 [cs.DC] https:\/\/arxiv.org\/abs\/2508.19559"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3749168"},{"key":"e_1_3_2_1_35_1","volume-title":"APEX: An Extensible and Dynamism-Aware Simulator for Automated Parallel Execution in LLM Serving. arXiv:2411.17651 [cs.DC] https:\/\/arxiv.org\/abs\/2411.17651","author":"Lin Yi-Chien","year":"2024","unstructured":"Yi-Chien Lin, Woosuk Kwon, Ronald Pineda, and Fanny Nina Paravecino. 2024. APEX: An Extensible and Dynamism-Aware Simulator for Automated Parallel Execution in LLM Serving. 
arXiv:2411.17651 [cs.DC] https:\/\/arxiv.org\/abs\/2411.17651"},{"key":"e_1_3_2_1_36_1","volume-title":"Parameswaran","author":"Liu Shu","year":"2025","unstructured":"Shu Liu, Soujanya Ponnapalli, Shreya Shankar, Sepanta Zeighami, Alan Zhu, Shubham Agarwal, Ruiqi Chen, Samion Suwito, Shuo Yuan, Ion Stoica, Matei Zaharia, Alvin Cheung, Natacha Crooks, Joseph E. Gonzalez, and Aditya G. Parameswaran. 2025. Supporting Our AI Overlords: Redesigning Data Systems to be Agent-First. arXiv:2509.00997 [cs.AI] https:\/\/arxiv.org\/abs\/2509.00997"},{"key":"e_1_3_2_1_37_1","unstructured":"Yuhan Liu Yihua Cheng Jiayi Yao Yuwei An Xiaokun Chen Shaoting Feng Yuyang Huang Samuel Shen Rui Zhang Kuntai Du and Junchen Jiang. 2025. LMCache: An Efficient KV Cache Layer for Enterprise-Scale LLM Inference. arXiv:2510.09665 [cs.LG] https:\/\/arxiv.org\/abs\/2510.09665"},{"key":"e_1_3_2_1_38_1","unstructured":"Yuhan Liu Yuyang Huang Jiayi Yao Shaoting Feng Zhuohan Gu Kuntai Du Hanchen Li Yihua Cheng Junchen Jiang Shan Lu Madan Musuvathi and Esha Choukse. 2025. DroidSpeak: KV Cache Sharing for Cross-LLM Communication and Multi-LLM Serving. arXiv:2411.02820 [cs.MA] https:\/\/arxiv.org\/abs\/2411.02820"},{"key":"e_1_3_2_1_39_1","unstructured":"llm-d. 2025. Accelerator Simulation. https:\/\/llm-d.ai\/docs\/guide\/Installation\/simulated-accelerators Accessed: 2026-02-06."},{"key":"e_1_3_2_1_40_1","unstructured":"llm-d. 2026. Architecture Documentation. https:\/\/llm-d.ai\/docs\/architecture"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3754448"},{"key":"e_1_3_2_1_42_1","volume-title":"Azure Well-Architected Framework","unstructured":"Microsoft. 2026. Azure Well-Architected Framework. Microsoft Corporation. https:\/\/learn.microsoft.com\/en-us\/azure\/well-architected\/ Microsoft Learn documentation."},{"key":"e_1_3_2_1_43_1","unstructured":"MoonshotAI. 2026. MoonshotAI Inference Stack: Scalable LLM Production Infrastructure. 
https:\/\/github.com\/MoonshotAI."},{"key":"e_1_3_2_1_44_1","volume-title":"How and When the Memory Chip Shortage Will End","author":"Moore Samuel K.","unstructured":"Samuel K. Moore. 2026. How and When the Memory Chip Shortage Will End. IEEE Spectrum. https:\/\/spectrum.ieee.org\/dram-shortage"},{"key":"e_1_3_2_1_45_1","unstructured":"Ella Neiman. 2026. Go Big or Go OOM: The Art of Scaling vLLM. https:\/\/www.ai21.com\/blog\/scaling-vllm-without-oom\/. AI21 Labs blog."},{"key":"e_1_3_2_1_46_1","unstructured":"NVIDIA Corporation. 2026. Planner Introduction. NVIDIA. https:\/\/docs.nvidia.com\/dynamo\/latest\/planner\/planner_intro.html NVIDIA Dynamo Documentation."},{"key":"e_1_3_2_1_47_1","volume-title":"The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=8sSqNntaMr","author":"Ong Isaac","year":"2025","unstructured":"Isaac Ong, Amjad Almahairi, Vincent Wu, Wei-Lin Chiang, Tianhao Wu, Joseph E. Gonzalez, M Waleed Kadous, and Ion Stoica. 2025. RouteLLM: Learning to Route LLMs from Preference Data. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=8sSqNntaMr"},{"key":"e_1_3_2_1_48_1","volume-title":"Emmanuele Lacavalla, Alessandro Basile, Shuyi Yang, Paul Castro, Daniel Kang, Joseph E. Gonzalez, Koushik Sen, Dawn Song, Ion Stoica, Matei Zaharia, and Marquita Ellis.","author":"Pan Melissa Z.","year":"2026","unstructured":"Melissa Z. Pan, Negar Arabzadeh, Riccardo Cogo, Yuxuan Zhu, Alexander Xiong, Lakshya A Agrawal, Huanzhi Mao, Emma Shen, Sid Pallerla, Liana Patel, Shu Liu, Tianneng Shi, Xiaoyuan Liu, Jared Quincy Davis, Emmanuele Lacavalla, Alessandro Basile, Shuyi Yang, Paul Castro, Daniel Kang, Joseph E. Gonzalez, Koushik Sen, Dawn Song, Ion Stoica, Matei Zaharia, and Marquita Ellis. 2026. Measuring Agents in Production. 
arXiv:2512.04123 [cs.CY] https:\/\/arxiv.org\/abs\/2512.04123"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_50_1","volume-title":"23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Qin Ruoyu","year":"2025","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Jialei Cui, Feng Ren, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2025. Mooncake: Trading More Storage for Less Computation \u2014 A KVCache-centric Architecture for Serving LLM Chatbot. In 23rd USENIX Conference on File and Storage Technologies (FAST 25). USENIX Association. https:\/\/www.usenix.org\/conference\/fast25\/presentation\/qin"},{"key":"e_1_3_2_1_51_1","unstructured":"reDB \/ infermesh contributors. 2025. InferMesh Simulator Documentation. GitHub. https:\/\/github.com\/redbco\/infermesh\/blob\/main\/docs\/SIMULATOR.md"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3719330.3721230"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2019.8916327"},{"key":"e_1_3_2_1_54_1","volume-title":"Cost-Effective Large Language Model Inference Infrastructure. arXiv preprint arXiv:2504.03648","author":"Shan Jiaxin","year":"2025","unstructured":"Jiaxin Shan, Varun Gupta, Le Xu, Haiyang Shi, Jingyuan Zhang, Ning Wang, Liguang Xie, and The AIBrix Team. 2025. AIBrix: Towards Scalable, Cost-Effective Large Language Model Inference Infrastructure. arXiv preprint arXiv:2504.03648 (2025). https:\/\/arxiv.org\/abs\/2504.03648"},{"key":"e_1_3_2_1_55_1","volume-title":"ATLAHS: An Application-centric Network Simulator Toolchain for AI, HPC, and Distributed Storage. arXiv:2505.08936 [cs.DC] https:\/\/arxiv.org\/abs\/2505.08936","author":"Shen Siyuan","year":"2025","unstructured":"Siyuan Shen, Tommaso Bonato, Zhiyi Hu, Pasquale Jordan, Tiancheng Chen, and Torsten Hoefler. 2025. ATLAHS: An Application-centric Network Simulator Toolchain for AI, HPC, and Distributed Storage. 
arXiv:2505.08936 [cs.DC] https:\/\/arxiv.org\/abs\/2505.08936"},{"key":"e_1_3_2_1_56_1","unstructured":"SimPy Development Team. 2026. SimPy 4.1 Documentation. https:\/\/simpy.readthedocs.io\/."},{"key":"e_1_3_2_1_57_1","volume-title":"MQSim: A Framework for Enabling Realistic Studies of Modern Multi-Queue SSD Devices. In 16th USENIX Conference on File and Storage Technologies (FAST 18)","author":"Tavakkol Arash","year":"2018","unstructured":"Arash Tavakkol, Juan G\u00f3mez-Luna, Mohammad Sadrosadati, Saugata Ghose, and Onur Mutlu. 2018. MQSim: A Framework for Enabling Realistic Studies of Modern Multi-Queue SSD Devices. In 16th USENIX Conference on File and Storage Technologies (FAST 18). USENIX Association, Oakland, CA, 49\u201366. https:\/\/www.usenix.org\/conference\/fast18\/presentation\/tavakkol"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3655038.3665954"},{"key":"e_1_3_2_1_59_1","unstructured":"Tom's Hardware. 2025. AI Datacenter Growth Drives Memory and SSD Cost Increases. Tom's Hardware technical article. https:\/\/www.tomshardware.com\/pc-components\/storage\/perfect-storm-of-demand-and-supply-driving-up-storage-costs"},{"key":"e_1_3_2_1_60_1","volume-title":"DeepSeek Shows the Limits of U.S. Export Controls on AI Chips. Brookings Institution (Jan 29","author":"Villasenor John","year":"2025","unstructured":"John Villasenor. 2025. DeepSeek Shows the Limits of U.S. Export Controls on AI Chips. Brookings Institution (Jan 29 2025). https:\/\/www.brookings.edu\/articles\/deepseek-shows-the-limits-of-us-export-controls-on-ai-chips\/"},{"key":"e_1_3_2_1_61_1","unstructured":"vLLM Contributors. 2026. vLLM API Reference: vllm.distributed.kv_events. https:\/\/docs.vllm.ai\/en\/latest\/api\/vllm\/distributed\/kv_events\/. Accessed: 2026-02-18."},{"key":"e_1_3_2_1_62_1","unstructured":"vllm-project. 2024. GuideLLM: Evaluate and Enhance Your LLM Deployments for Real-World Inference Needs. https:\/\/github.com\/vllm-project\/guidellm. 
Accessed: 2026-02-06."},{"key":"e_1_3_2_1_63_1","unstructured":"vLLM Project. 2025. vLLM Production Stack: Reference Stack for Production-Scale vLLM Deployments. vLLM Technical Blog. Available online: https:\/\/blog.vllm.ai\/2025\/01\/21\/stack-release.html."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3616855.3635825"},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of the 2025 USENIX Conference on Usenix Annual Technical Conference (Boston, MA, USA) (USENIX ATC '25). USENIX Association, USA, Article 28","author":"Wang Jiahao","year":"2025","unstructured":"Jiahao Wang, Jinbo Han, Xingda Wei, Sijie Shen, Dingyan Zhang, Chenguang Fang, Rong Chen, Wenyuan Yu, and Haibo Chen. 2025. KVCache cache in the wild: characterizing and optimizing KVCache cache at a large cloud provider. In Proceedings of the 2025 USENIX Conference on Usenix Annual Technical Conference (Boston, MA, USA) (USENIX ATC '25). USENIX Association, USA, Article 28, 18 pages."},{"key":"e_1_3_2_1_66_1","unstructured":"Wikipedia contributors. 2026. 2024-present Global Memory Supply Shortage. Wikipedia. https:\/\/en.wikipedia.org\/wiki\/2024%E2%80%93present_global_memory_supply_shortage Summary of global memory shortage including DRAM and NAND price inflation driven by AI infrastructure demand."},{"key":"e_1_3_2_1_67_1","unstructured":"Yuxing Xiang Xue Li Kun Qian Wenyuan Yu Ennan Zhai and Xin Jin. 2025. ServeGen: Workload Characterization and Generation of Large Language Model Serving in Production. arXiv:2505.09999 [cs.DC] https:\/\/arxiv.org\/abs\/2505.09999"},{"key":"e_1_3_2_1_68_1","volume-title":"The Fourteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=3loQDtveWI","author":"Xie Huanyi","year":"2026","unstructured":"Huanyi Xie, Yubin Chen, Liangyu Wang, Lijie Hu, and Di Wang. 2026. Predicting LLM Output Length via Entropy-Guided Representations. In The Fourteenth International Conference on Learning Representations. 
https:\/\/openreview.net\/forum?id=3loQDtveWI"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2508.18572"},{"key":"e_1_3_2_1_70_1","volume-title":"Proceedings of the 39th Conference on Neural Information Processing Systems (NeurIPS 2025)","author":"Yang Dongsheng","year":"2025","unstructured":"Dongsheng Yang, Austin Li, Kai Li, and Wyatt Lloyd. 2025. Learned Prefix Caching for Efficient LLM Inference. In Proceedings of the 39th Conference on Neural Information Processing Systems (NeurIPS 2025) Poster Session. Poster. Available at https:\/\/openreview.net\/pdf?id=Vj48eXaQDM. Accessed: 2026-02-12."},{"key":"e_1_3_2_1_71_1","volume-title":"Proceedings of the 42nd International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"71209","author":"Yang Shiming","year":"2025","unstructured":"Shiming Yang, Yuxuan Tong, Xinyao Niu, Graham Neubig, and Xiang Yue. 2025. Demystifying Long Chain-of-Thought Reasoning. In Proceedings of the 42nd International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 267), Aarti Singh, Maryam Fazel, Daniel Hsu, Simon Lacoste-Julien, Felix Berkenkamp, Tegan Maharaj, Kiri Wagstaff, and Jerry Zhu (Eds.). PMLR, 71177\u201371209. https:\/\/proceedings.mlr.press\/v267\/yang25ae.html"},{"key":"e_1_3_2_1_72_1","volume-title":"Beluga: A CXL-Based Memory Architecture for Scalable and Efficient LLM KVCache Management. arXiv:2511.20172 [cs.DC] https:\/\/arxiv.org\/abs\/2511.20172","author":"Yang Xinjun","year":"2025","unstructured":"Xinjun Yang, Qingda Hu, Junru Li, Feifei Li, Yicong Zhu, Yuqi Zhou, Qiuru Lin, Jian Dai, Yang Kong, Jiayu Zhang, Guoqiang Xu, and Qiang Liu. 2025. Beluga: A CXL-Based Memory Architecture for Scalable and Efficient LLM KVCache Management. arXiv:2511.20172 [cs.DC] https:\/\/arxiv.org\/abs\/2511.20172"},{"key":"e_1_3_2_1_73_1","unstructured":"Jenny Yi Mark Kurtz and Addie Stevens. 2025. 
GuideLLM: Evaluate LLM deployments for real-world inference. Red Hat Developer. https:\/\/developers.redhat.com\/articles\/2025\/06\/20\/guidellm-evaluate-llm-deployments-real-world-inference Accessed: 2026-02-06."},{"key":"e_1_3_2_1_74_1","volume-title":"Yan Yan, Beidi Chen, Guangyu Sun, and Kurt Keutzer.","author":"Yuan Zhihang","year":"2024","unstructured":"Zhihang Yuan, Yuzhang Shang, Yang Zhou, Zhen Dong, Zhe Zhou, Chenhao Xue, Bingzhe Wu, Zhikai Li, Qingyi Gu, Yong Jae Lee, Yan Yan, Beidi Chen, Guangyu Sun, and Kurt Keutzer. 2024. LLM Inference Unveiled: Survey and Roofline Model Insights. arXiv:2402.16363 [cs.CL] https:\/\/arxiv.org\/abs\/2402.16363"},{"key":"e_1_3_2_1_75_1","unstructured":"Murong Yue Jie Zhao Min Zhang Liang Du and Ziyu Yao. 2024. Large Language Model Cascades with Mixture of Thoughts Representations for Cost-efficient Reasoning. arXiv:2310.03094 [cs.CL] https:\/\/arxiv.org\/abs\/2310.03094"},{"key":"e_1_3_2_1_76_1","volume-title":"Proceedings of the 19th USENIX Symposium on Operating Systems Design and Implementation (OSDI '25)","author":"Zhang Dingyan","year":"2025","unstructured":"Dingyan Zhang, Haotian Wang, Yang Liu, Xingda Wei, Yizhou Shan, Rong Chen, and Haibo Chen. 2025. BlitzScale: Fast and Live Large Model Autoscaling with O(1) Host Caching. In Proceedings of the 19th USENIX Symposium on Operating Systems Design and Implementation (OSDI '25). USENIX Association, Boston, MA, USA, 275\u2013293. https:\/\/www.usenix.org\/conference\/osdi25\/presentation\/zhang-dingyan"},{"key":"e_1_3_2_1_77_1","volume-title":"Fei Sun, Linsen Ma, Chris J. Newburn, Teresa Zhang, Yang Liu, Jiangpeng Li, Hao Zhong, and Wen-Mei Hwu.","author":"Zhang Tong","year":"2025","unstructured":"Tong Zhang, Vikram Sharma Mailthody, Fei Sun, Linsen Ma, Chris J. Newburn, Teresa Zhang, Yang Liu, Jiangpeng Li, Hao Zhong, and Wen-Mei Hwu. 2025. From Minutes to Seconds: Redefining the Five-Minute Rule for AI-Era Memory Hierarchies. 
arXiv:2511.03944 [cs.AR] https:\/\/arxiv.org\/abs\/2511.03944"},{"key":"e_1_3_2_1_78_1","volume-title":"Benjamin Genchel, and Amanda Cercas Curry.","author":"Zhao Justin","year":"2025","unstructured":"Justin Zhao, Flor Miriam Plaza del Arco, Benjamin Genchel, and Amanda Cercas Curry. 2025. Language Model Council: Democratically Benchmarking Foundation Models on Highly Subjective Tasks. arXiv:2406.08598 [cs.CL] https:\/\/arxiv.org\/abs\/2406.08598"}],"event":{"name":"EuroSys '26: 21st European Conference on Computer Systems","location":"Edinburgh Scotland UK","acronym":"EuroMLSys '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Sixth European Workshop on Machine Learning and Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3805621.3807623","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:17:20Z","timestamp":1777382240000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805621.3807623"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,27]]},"references-count":78,"alternative-id":["10.1145\/3805621.3807623","10.1145\/3805621"],"URL":"https:\/\/doi.org\/10.1145\/3805621.3807623","relation":{},"subject":[],"published":{"date-parts":[[2026,4,27]]},"assertion":[{"value":"2026-04-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}