{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T07:22:06Z","timestamp":1772695326913,"version":"3.50.1"},"reference-count":104,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100014188","name":"Korea government","doi-asserted-by":"publisher","award":["RS-202400438851"],"award-info":[{"award-number":["RS-202400438851"]}],"id":[{"id":"10.13039\/501100014188","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100004358","name":"Samsung Research Funding Center of Samsung Electronics","doi-asserted-by":"publisher","award":["SRFC-IT2402-03"],"award-info":[{"award-number":["SRFC-IT2402-03"]}],"id":[{"id":"10.13039\/100004358","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,31]]},"DOI":"10.1109\/hpca68181.2026.11408569","type":"proceedings-article","created":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T20:47:22Z","timestamp":1772657242000},"page":"1-16","source":"Crossref","is-referenced-by-count":0,"title":["The Cost of Dynamic Reasoning: Demystifying AI Agents and Test-Time Scaling from an AI Infrastructure Perspective"],"prefix":"10.1109","author":[{"given":"Jiin","family":"Kim","sequence":"first","affiliation":[{"name":"KAIST,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Byeongjun","family":"Shin","sequence":"additional","affiliation":[{"name":"KAIST,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinha","family":"Chung","sequence":"additional","affiliation":[{"name":"KAIST,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minsoo","family":"Rhu","sequence":"additional","affiliation":[{"name":"KAIST,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Keyformer: Kv cache reduction through key tokens selection for efficient generative inference","volume-title":"Proceedings of Machine Learning and Systems","author":"Adnan","year":"2024"},{"key":"ref2","article-title":"Taming Throughput-Latency tradeoff in LLM inference with Sarathi-Serve","volume-title":"Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Agrawal","year":"2024"},{"key":"ref3","year":"2025","journal-title":"Kimi k2: Open agentic intelligence"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"ref5","year":"2024","journal-title":"Model Context Protocol (MCP)"},{"key":"ref6","article-title":"Small language models are the future of agentic ai","author":"Belcak","year":"2025","journal-title":"arXiv preprint"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29720"},{"key":"ref8","article-title":"Medusa: Simple 11 m inference acceleration framework with multiple decoding heads","volume-title":"Proceedings of the 41st International Conference on Machine Learning, ser. ICML\u201924","author":"Cai","year":"2024"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3713082.3730377"},{"key":"ref10","article-title":"Evaluating Large Language Models Trained on Code","author":"Chen","year":"2021","journal-title":"in arxivorg"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-75538-8_7"},{"key":"ref12","doi-asserted-by":"crossref","DOI":"10.1016\/j.joule.2023.09.004","article-title":"The growing energy footprint of artificial intelligence","author":"de Vries","year":"2023","journal-title":"Joule"},{"key":"ref13","author":"DeepSeek-AI","year":"2025","journal-title":"Deepseek-r1: Incentivizing reasoning capability in 11 ms via reinforcement learning"},{"key":"ref14","year":"2025","journal-title":"Data Center Capex to Surpass $ 1 Trillion by 2029, According to Dell\u2019Oro Group"},{"key":"ref15","year":"2025","journal-title":"Chatgpt statistics and facts (2024\u20132025)"},{"key":"ref16","article-title":"Best-route: Adaptive 11 m routing with test-time optimal compute","author":"Ding","year":"2025","journal-title":"arXiv preprint"},{"issue":"120","key":"ref17","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"Journal of Machine Learning Research"},{"key":"ref18","author":"Fello","year":"2024","journal-title":"Gemini 1.5 pro: All you need to know about this near-perfect ai model"},{"key":"ref19","article-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers","author":"Frantar","year":"2022","journal-title":"arXiv preprint"},{"key":"ref20","first-page":"111","article-title":"Cost-Efficient large language model serving for multi-turn conversations with CachedAttention","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Gao"},{"key":"ref21","year":"2025","journal-title":"Announcing the Agent2Agent Protocol (A2A)"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695954"},{"key":"ref23","article-title":"When attention sink emerges in language models: An empirical view","author":"Gu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref24","article-title":"Deepseek-r1: Incentivizing reasoning capability in 11 ms via reinforcement learning","author":"Guo","year":"2025","journal-title":"arXiv preprint"},{"key":"ref25","article-title":"Measuring Mathematical Problem Solving With the MATH Dataset","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)","author":"Hendrycks","year":"2021"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.4140\/TCP.n.2015.249"},{"key":"ref27","year":"2024","journal-title":"What is a hyperscale data center?"},{"key":"ref28","author":"Week","year":"2024","journal-title":"The Success of US Chip Manufacturing Hinges on Our Electric Grid"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716245"},{"key":"ref30","author":"Kim","year":"2023","journal-title":"LLMCompiler: An LLM Compiler for Parallel Function Calling"},{"key":"ref31","article-title":"An LLM Compiler for Parallel Function Calling","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Kim","year":"2024"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref33","article-title":"InfiniGen: Efficient generative inference of large language models with dynamic KV cache management","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Lee","year":"2024"},{"key":"ref34","article-title":"Fast inference from transformers via speculative decoding","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Leviathan","year":"2023"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607034"},{"key":"ref36","article-title":"CAMEL: Communicative agents for \u201cmind\u201d exploration of large language model society","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)","author":"Li","year":"2023"},{"key":"ref37","article-title":"Mixture-of-transformers: A sparse and scalable architecture for multi-modal foundation models","author":"Liang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref38","first-page":"87","article-title":"Awq: Activation-aware weight quantization for on-device llm compression and acceleration","volume-title":"Proceedings of machine learning and systems","volume":"6","author":"Lin","year":"2024"},{"key":"ref39","article-title":"Deepseek-v2: A strong, economical, and efficient mixture-of-experts language model","author":"Liu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.52202\/079017-4443"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672274"},{"key":"ref42","article-title":"Deja vu: contextual sparsity for efficient 11 ms at inference time","volume-title":"Proceedings of the 40th International Conference on Machine Learning, ser. ICML\u201923","author":"Liu","year":"2023"},{"key":"ref43","author":"Luo","year":"2025","journal-title":"Autellix: An efficient serving engine for 11 m agents as general programs"},{"key":"ref44","year":"2025","journal-title":"Llama-3.1\u201370B-Instruct"},{"key":"ref45","first-page":"1","year":"2025","journal-title":"Llama-3.1-8B-Instruct"},{"key":"ref46","year":"2025","journal-title":"Llama 4: Advancing multimodal intelligence at scale"},{"key":"ref47","journal-title":"MLPerf Inference: Datacenter"},{"key":"ref48","author":"Muennighoff","year":"2025","journal-title":"s1: Simple test-time scaling"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1299"},{"key":"ref50","author":"Kumar","year":"2025","journal-title":"How Many Google Searches Per Day [2025 Data]"},{"key":"ref51","journal-title":"NVIDIA DCGM Documentation"},{"key":"ref52","journal-title":"NVIDIA Dynamo Platform"},{"key":"ref53","year":"2022","journal-title":"IntroducingT ChatGP"},{"key":"ref54","year":"2023","journal-title":"OpenAI Function Calling"},{"key":"ref55","year":"2025","journal-title":"Announcing The Stargate Project"},{"key":"ref56","year":"2025","journal-title":"Deep Research FAQ"},{"key":"ref57","year":"2025","journal-title":"Introducing Deep Research"},{"key":"ref58","year":"2025","journal-title":"New funding to build towards AGI"},{"key":"ref59","article-title":"Adaptive 11 m routing under budget constraints","author":"Panda","year":"2025","journal-title":"arXiv preprint"},{"key":"ref60","author":"Paris","year":"2025","journal-title":"Chatgpt hits 1 billion users, openai ceo says, doubled in weeks"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"ref62","author":"Verma","year":"2024","journal-title":"A bottle of water per email: the hidden environmental costs of using AI chatbots"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.378"},{"key":"ref64","article-title":"Optimizing test-time compute via meta reinforcement fine-tuning","author":"Qu","year":"2025","journal-title":"arXiv preprint"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TPWRS.2022.3173250"},{"key":"ref66","year":"2024","journal-title":"Meta to invest $ 10 billion for Louisiana data center"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1145\/3642970.3655844"},{"key":"ref68","author":"City Light","year":"2024","journal-title":"Fingertip Facts"},{"key":"ref69","year":"2023","journal-title":"The Inference Cost Of Search Disruption - Large Language Model Cost Analysis"},{"key":"ref70","author":"Team","year":"2023","journal-title":"Sharegpt"},{"key":"ref71","author":"Shinn","year":"2023","journal-title":"Reflexion: Language Agents with Verbal Reinforcement Learning"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0377"},{"key":"ref73","year":"2025","journal-title":"Chatgpt.com website performance june 2025"},{"key":"ref74","author":"Singh","year":"2025","journal-title":"Meta\u2019s Zuckerberg pledges hundreds of billions for AI data centers in superintelligence push"},{"key":"ref75","author":"Snell","year":"2024","journal-title":"Scaling llm test-time compute optimally can be more effective than scaling model parameters"},{"key":"ref76","article-title":"Scaling llm test-time compute optimally can be more effective than scaling model parameters","year":"2024","journal-title":"arXiv preprint"},{"key":"ref77","article-title":"Llm pruning and distillation in practice: The minitron approach","author":"Sreenivas","year":"2024","journal-title":"arXiv preprint"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716278"},{"key":"ref79","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arxiv.org"},{"key":"ref80","author":"Tran","year":"2025","journal-title":"Semiconductor Manufacturing Energy Consumption: How Green Is the Chip Industry?"},{"key":"ref81","year":"2024","journal-title":"Electricity explained Electricity generation, capacity, and sales in the United States"},{"key":"ref82","journal-title":"vLLM Documentation"},{"key":"ref83","article-title":"Self-Consistency Improves Chain of Thought Reasoning in Language Models","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Wang","year":"2023"},{"key":"ref84","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume-title":"in Proceedings of the 36th International Conference on Neural Information Processing Systems, ser. NIPS \u201922","author":"Wei"},{"key":"ref85","year":"2025","journal-title":"Wikipedia api"},{"key":"ref86","author":"Alpha LLC","year":"2025","journal-title":"Wolfram alpha apis"},{"key":"ref87","article-title":"Autogen: Enabling next-gen LLM applications via multiagent conversations","volume-title":"First Conference on Language Modeling (CoLM)","author":"Wu","year":"2024"},{"key":"ref88","year":"2025","journal-title":"Colossus - xAI"},{"key":"ref89","first-page":"38 087","article-title":"Smoothquant: Accurate and efficient post-training quantization for large language models","volume-title":"International conference on machine learning. PMLR","author":"Xiao"},{"key":"ref90","article-title":"AI Metropolis: Scaling Large Language Model-Based Multi-Agent Simulation with Out-of-Order Execution","author":"Xie","year":"2024","journal-title":"arxiv.org"},{"key":"ref91","article-title":"Towards thinking-optimal scaling of test-time compute for 11 m reasoning","author":"Yang","year":"2025","journal-title":"arXiv preprint"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1259"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696098"},{"key":"ref94","article-title":"WebShop: Towards Scalable Real-World Web Interaction with Grounded Language Agents","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)","author":"Yao","year":"2022"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0517"},{"key":"ref96","article-title":"React: Synergizing Reasoning and Acting in Language Models","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Yao","year":"2023"},{"key":"ref97","year":"2023","journal-title":"ReAct: Synergizing Reasoning and Acting in Language Models"},{"key":"ref98","article-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models","volume-title":"Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Yu","year":"2022"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1126"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2000"},{"key":"ref101","article-title":"DistServe: Disaggregating prefill and decoding for goodput-optimized large language model serving","volume-title":"Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Zhong","year":"2024"},{"key":"ref102","article-title":"Language Agent Tree Search Unifies Reasoning, Acting, and Planning in Language Models","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Zhou","year":"2024"},{"key":"ref103","author":"Zhou","year":"2024","journal-title":"Official Repo of Language Agent Tree Search (LATS)"},{"key":"ref104","article-title":"Least-to-Most Prompting Enables Complex Reasoning in Large Language Models","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Zhou","year":"2023"}],"event":{"name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","location":"Sydney, Australia","start":{"date-parts":[[2026,1,31]]},"end":{"date-parts":[[2026,2,4]]}},"container-title":["2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11408404\/11408433\/11408569.pdf?arnumber=11408569","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T06:51:09Z","timestamp":1772693469000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11408569\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":104,"URL":"https:\/\/doi.org\/10.1109\/hpca68181.2026.11408569","relation":{},"subject":[],"published":{"date-parts":[[2026,1,31]]}}}