{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T06:05:30Z","timestamp":1779861930046,"version":"3.53.1"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,4,26]],"date-time":"2026-04-26T00:00:00Z","timestamp":1777161600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,26]],"date-time":"2026-04-26T00:00:00Z","timestamp":1777161600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,4,26]]},"DOI":"10.1109\/ispass69572.2026.00014","type":"proceedings-article","created":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T19:39:19Z","timestamp":1779824359000},"page":"28-38","source":"Crossref","is-referenced-by-count":0,"title":["TaxBreak: Unmasking the Hidden Costs of LLM Inference Through Overhead Decomposition"],"prefix":"10.1109","author":[{"given":"Prabhu","family":"Vellaisamy","sequence":"first","affiliation":[{"name":"Carnegie Mellon University,NeuroAI Computer Architecture Lab (NCAL) Electrical and Computer Engineering Department"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shreesh","family":"Tripathi","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,NeuroAI Computer Architecture Lab (NCAL) Electrical and Computer Engineering Department"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Vignesh","family":"Natarajan","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,NeuroAI Computer Architecture Lab (NCAL) Electrical and Computer Engineering Department"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Surya Santhan","family":"Thenarasu","sequence":"additional","affiliation":[{"name":"Carnegie 
Mellon University,NeuroAI Computer Architecture Lab (NCAL) Electrical and Computer Engineering Department"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shawn","family":"Blanton","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,NeuroAI Computer Architecture Lab (NCAL) Electrical and Computer Engineering Department"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"John P.","family":"Shen","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,NeuroAI Computer Architecture Lab (NCAL) Electrical and Computer Engineering Department"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gpt-4 technical report","volume-title":"arXiv preprint arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref2","first-page":"351","article-title":"Vidur: A large-scale simulation framework for LLM inference","volume-title":"Proceedings of Machine Learning and Systems","volume":"6","author":"Agrawal"},{"key":"ref3","first-page":"117","article-title":"Taming Throughput-Latency tradeoff in LLM inference with Sarathi-Serve","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal"},{"key":"ref4","article-title":"Flashdmoe: Fast distributed moe in a single kernel","author":"Aimuyo","year":"2025","journal-title":"arXiv preprint arXiv:2506.04667"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640366"},{"key":"ref6","first-page":"ascl-2111","article-title":"Jax: Autograd and xla","author":"Bradbury","year":"2021","journal-title":"Astrophysics Source Code Library"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS51385.2021.00027"},{"key":"ref8","article-title":"Evaluating large language models trained on code","author":"Chen","year":"2021","journal-title":"arXiv preprint arXiv:2107.03374"},{"key":"ref9","first-page":"578","article-title":"TVM: An 
automated End-to-End optimizing compiler for deep learning","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3731599.3767706"},{"key":"ref11","article-title":"Flashattention-2: Faster attention with better parallelism and work partitioning","author":"Dao","year":"2023","journal-title":"arXiv preprint arXiv:2307.08691"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1189"},{"key":"ref13","article-title":"Liminal: Exploring the frontiers of LLM decode performance","author":"Davies","year":"2025","journal-title":"arXiv preprint arXiv:2507.14397"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.98"},{"key":"ref15","article-title":"The llama 3 herd of models","author":"Grattafiori","year":"2024","journal-title":"arXiv preprint arXiv:2407.21783"},{"key":"ref16","article-title":"Deepseek-r1: Incentivizing reasoning capability in LLMs via reinforcement learning","author":"Guo","year":"2025","journal-title":"arXiv preprint arXiv:2501.12948"},{"key":"ref17","article-title":"Harder tasks need more experts: Dynamic routing in moe models","author":"Huang","year":"2024","journal-title":"arXiv preprint arXiv:2403.07652"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref19","article-title":"Large language model inference acceleration: A comprehensive hardware perspective","author":"Li","year":"2024","journal-title":"arXiv preprint arXiv:2410.04466"},{"key":"ref20","article-title":"Deepseek-v3 technical report","volume-title":"arXiv preprint arXiv:2412.19437","author":"Liu","year":"2024"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.3102\/1076998619872761"},{"key":"ref23","article-title":"Pytorch: An imperative style, high-performance deep learning 
library","volume":"32","author":"Paszke","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref24","article-title":"A cpu-centric perspective on agentic ai","author":"Raj","year":"2025","journal-title":"arXiv preprint arXiv:2511.00739"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3440689"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2997"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0377"},{"key":"ref28","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv preprint arXiv:2302.13971"},{"key":"ref29","article-title":"Eliminating multi-gpu performance taxes: A systems approach to efficient distributed LLMs","author":"Trifan","year":"2025","journal-title":"arXiv preprint arXiv:2511.02168"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS64960.2025.00015"},{"key":"ref31","article-title":"A systematic characterization of LLM inference on gpus","author":"Wang","year":"2025","journal-title":"arXiv preprint arXiv:2512.01644"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1177\/0018720813475812"},{"key":"ref33","article-title":"React: Synergizing reasoning and acting in language models","volume-title":"The eleventh international conference on learning representations","author":"Yao"},{"key":"ref34","first-page":"521","article-title":"Orca: A distributed serving system for Transformer-Based generative models","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu"},{"key":"ref35","article-title":"Toward inference-optimal mixture-of-expert large language models","author":"Yun","year":"2024","journal-title":"arXiv preprint arXiv:2404.02852"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3639032"},{"key":"ref37","article-title":"The hidden bloat in machine learning 
systems","author":"Zhang","year":"2025","journal-title":"arXiv preprint arXiv:2503.14226"}],"event":{"name":"2026 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)","location":"Seoul, Korea, Republic of","start":{"date-parts":[[2026,4,26]]},"end":{"date-parts":[[2026,4,28]]}},"container-title":["2026 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11527204\/11527232\/11527256.pdf?arnumber=11527256","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T05:36:22Z","timestamp":1779860182000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11527256\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/ispass69572.2026.00014","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]}}}