{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T23:22:02Z","timestamp":1780356122810,"version":"3.54.1"},"reference-count":71,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,31]]},"DOI":"10.1109\/hpca68181.2026.11408490","type":"proceedings-article","created":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T20:47:22Z","timestamp":1772657242000},"page":"1-17","source":"Crossref","is-referenced-by-count":1,"title":["RPU \u2013 A Reasoning Processing Unit"],"prefix":"10.1109","author":[{"given":"Matthew Joseph","family":"Adiletta","sequence":"first","affiliation":[{"name":"Harvard University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gu-Yeon","family":"Wei","sequence":"additional","affiliation":[{"name":"Harvard University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"David","family":"Brooks","sequence":"additional","affiliation":[{"name":"Harvard University"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Introduction to torch.compile - PyTorch Tutorials 2.6.0+cu124 documentation","year":"2023"},{"key":"ref2","volume-title":"Groq First Generation 14 nm Chip Just Got a 6x Speed Boost: Introducing Llama 3.3 70B Speculative Decoding on GroqCloud \u2122 - Groq is Fast AI Inference","year":"2024"},{"key":"ref3","volume-title":"NVIDIA Data Center Deep Learning Product Performance AI Inference","year":"2024"},{"key":"ref4","volume-title":"Hardware Benchmarking & Performance Analysis","year":"2025"},{"key":"ref5","volume-title":"neuralmagic (Neural Magic)","year":"2025"},{"key":"ref6","volume-title":"NVIDIA Blackwell","year":"2025"},{"key":"ref7","doi-asserted-by":"crossref","first-page":"78 120","DOI":"10.1109\/ACCESS.2019.2917698","article-title":"A Survey of Computer Architecture Simulation Techniques and Tools","volume":"7","author":"Akram","year":"2019","journal-title":"IEEE Access"},{"key":"ref8","article-title":"The Relationship Between Reasoning and Performance in Large Language Models - o3 (mini) Thinks Harder, Not Longer","volume-title":"arXiv","author":"Ballon","year":"2025"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/2678373.2665705"},{"key":"ref10","doi-asserted-by":"crossref","DOI":"10.1145\/3713082.3730390","article-title":"Good things come in small packages: Should we build AI clusters with Lite-GPUs?","volume-title":"arXiv","author":"Canakci","year":"2025"},{"key":"ref11","first-page":"73","article-title":"Architecting an EnergyEfficient DRAM System for GPUs","volume-title":"2017 IEEE International Symposium on High Performance Computer Architecture (HPCA)","author":"Chatterjee"},{"key":"ref12","article-title":"Do NOT Think That Much for 2+3= ? On the Overthinking of o1-Like LLMs","volume-title":"arXiv","author":"Chen","year":"2025"},{"issue":"2","key":"ref13","doi-asserted-by":"crossref","first-page":"113","DOI":"10.1109\/LCA.2023.3305386","article-title":"Unleashing the Potential of PIM: Accelerating Large Batched Inference of Transformer-Based Generative Models","volume":"22","author":"Choi","year":"2023","journal-title":"IEEE Computer Architecture Letters"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2023.3256796"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589351"},{"issue":"9","key":"ref16","first-page":"1423","article-title":"Universal Chiplet Interconnect Express (UCIe): An Open Industry Standard for Innovations With Chiplets at Package Level","volume-title":"IEEE Transactions on Components, Packaging and Manufacturing Technology","volume":"12","author":"Das Sharma"},{"key":"ref17","volume-title":"Introducing NVIDIA Dynamo, A Low-Latency Distributed Inference Framework for Scaling Reasoning AI Models","author":"Elmeleegy","year":"2025"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3710848.3710871"},{"key":"ref19","volume-title":"TSMC Details 3 nm Process Technology: Full Node Scaling for 2H22 Volume Production","author":"Frumusanu","year":"2020"},{"key":"ref20","article-title":"Distil-Whisper: Robust Knowledge Distillation via Large-Scale Pseudo Labelling","volume-title":"arXiv","author":"Gandhi","year":"2023"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s12599-022-00755-x"},{"key":"ref22","first-page":"42","article-title":"Ponte Vecchio: A Multi-Tile 3D Stacked Processor for Exascale Computing","volume-title":"2022 IEEE International Solid-State Circuits Conference (ISSCC)","volume":"65","author":"Gomes"},{"issue":"5","key":"ref23","first-page":"41","article-title":"On-Chip Interconnection Networks of the TRIPS Chip","volume-title":"IEEE Micro","volume":"27","author":"Gratz"},{"key":"ref24","first-page":"722","article-title":"NeuPIMs: NPU-PIM Heterogeneous Acceleration for Batched LLM Inferencing","volume-title":"Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"3","author":"Heo"},{"key":"ref25","first-page":"1022","article-title":"CoWoS Architecture Evolution for Next Generation HPC on 2.5D System in Package","volume-title":"2023 IEEE 73rd Electronic Components and Technology Conference (ECTC)","author":"Hu"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"ref27","first-page":"1","article-title":"Intel Gaudi 3 AI Accelerator: Architected for Gen AI Training and Inference","volume-title":"2024 IEEE Hot Chips 36 Symposium (HCS)","author":"Kaplan"},{"issue":"11","key":"ref28","first-page":"3242","article-title":"An Energy-Efficient Design of TSV I\/O for HBM With a Data Rate up to 10 Gb\/s","volume-title":"IEEE Journal of Solid-State Circuits","volume":"58","author":"Kim"},{"key":"ref29","first-page":"1","article-title":"Present and Future, Challenges of High Bandwith Memory (HBM)","volume-title":"2024 IEEE International Memory Workshop (IMW)","author":"Kim"},{"key":"ref30","first-page":"368","article-title":"A case for exploiting subarray-level parallelism (SALP) in DRAM","volume-title":"2012 39th Annual International Symposium on Computer Architecture (ISCA)","author":"Kim"},{"key":"ref31","article-title":"Exploiting the DRAM Microarchitecture to Increase Memory-Level Parallelism","volume-title":"arXiv","author":"Kim","year":"2018"},{"key":"ref32","first-page":"1","article-title":"Graphcore","volume-title":"2021 IEEE Hot Chips 33 Symposium (HCS)","author":"Knowles"},{"key":"ref33","first-page":"57","article-title":"Performance Modeling and Workload Analysis of Distributed Large Language Model Training and Inference","volume-title":"2024 IEEE International Symposium on Workload Characterization (IISWC)","author":"Kundu"},{"key":"ref34","doi-asserted-by":"crossref","DOI":"10.1145\/3600006.3613165","article-title":"Efficient Memory Management for Large Language Model Serving with PagedAttention","volume-title":"arXiv","author":"Kwon","year":"2023"},{"key":"ref35","first-page":"238","article-title":"13.4 A 48 GB 16-High 1280 GB \/s HBM3E DRAM with All-Around Power TSV and a 6-Phase RDQS Scheme for TSV Area Optimization","volume-title":"2024 IEEE International Solid-State Circuits Conference (ISSCC)","volume":"67","author":"Lee"},{"key":"ref36","first-page":"43","article-title":"Hardware Architecture and Software Stack for PIM Based on Commercial DRAM Technology: Industrial Product","volume-title":"2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA)","author":"Lee"},{"key":"ref37","article-title":"Fast Inference from Transformers via Speculative Decoding","volume-title":"arXiv","author":"Leviathan","year":"2023"},{"key":"ref38","article-title":"A Performance Model for Warp Specialization Kernels","volume-title":"arXiv","author":"Liu","year":"2025"},{"key":"ref39","article-title":"Nanoscaling Floating-Point (NxFP): NanoMantissa, Adaptive Microexponents, and Code Recycling for Direct-Cast Compression of Large Language Models","author":"Lo","year":"2024","journal-title":"arXiv"},{"key":"ref40","first-page":"557","article-title":"Embedded Multi-die Interconnect Bridge (EMIB) - A High Density, High Bandwidth Packaging Interconnect","volume-title":"2016 IEEE 66th Electronic Components and Technology Conference (ECTC)","author":"Mahajan"},{"key":"ref41","article-title":"Accelerating Speculative Decoding using Dynamic Speculation Length","volume-title":"arXiv","author":"Mamou","year":"2024"},{"key":"ref42","first-page":"1","article-title":"Mitigating Response Delays in Free-Form Conversations with LLM-powered Intelligent Virtual Agents","volume-title":"Proceedings of the 7th ACM Conference on Conversational User Interfaces","author":"Maslych"},{"key":"ref43","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/IEDM45741.2023.10413890","article-title":"Advanced Packaging Technologies in Memory Applications for Future Generative AI Era","volume-title":"2023 International Electron Devices Meeting (IEDM)","author":"Moon","year":"2023"},{"key":"ref44","volume-title":"Slow AI: Designing User Control for Long Tasks","author":"Nielsen","year":"2025"},{"key":"ref45","first-page":"41","article-title":"Fine-Grained DRAM: EnergyEfficient DRAM for Extreme Bandwidth Systems","volume-title":"2017 50th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO)","author":"O\u2019Connor"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640422"},{"issue":"1","key":"ref47","first-page":"256","article-title":"A 192-Gb 12-High 896-GB\/s HBM3 DRAM With a TSV Auto-Calibration Scheme and Machine-Learning-Based Layout Optimization","volume-title":"IEEE Journal of Solid-State Circuits","volume":"58","author":"Park"},{"key":"ref48","volume-title":"The Memory Wall: Past, Present, and Future of DRAM","author":"Patel","year":"2024"},{"key":"ref49","volume-title":"AI Server Cost Analysis - Memory Is The Biggest Loser","author":"Patel","year":"2023"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"issue":"1","key":"ref51","doi-asserted-by":"crossref","first-page":"43","DOI":"10.1109\/JSSC.2018.2875092","article-title":"A $1.17-\\text{pJ} \/ \\mathrm{b}, 25-\\text{Gb} \/ \\mathrm{s} \/$ pin Ground-Referenced Single-Ended Serial Link for Off- and On-Package Communication Using a Process- and Temperature-Adaptive Voltage Regulator","volume":"54","author":"Poulton","year":"2019","journal-title":"IEEE Journal of Solid-State Circuits"},{"key":"ref52","first-page":"1353","article-title":"SambaNova SN40L: Scaling the AI Memory Wall with Dataflow and Composition of Experts","volume-title":"2024 57th IEEE\/ACM International Symposium on Microarchitecture (MICRO)","author":"Prabhakar"},{"key":"ref53","first-page":"10271","volume-title":"Pushing the limits of narrow precision inferencing at cloud scale with microsoft floating point","author":"Rouhani","year":"2020"},{"issue":"4","key":"ref54","first-page":"1051","article-title":"A 16 GB 1024 GB\/s HBM3 DRAM With Source-Synchronized Bus Design and On-Die Error Control Scheme for Enhanced RAS Features","volume-title":"IEEE Journal of Solid-State Circuits","volume":"58","author":"Ryu"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/2508148.2485963"},{"key":"ref56","volume-title":"InferenceMAX by SemiAnalysis"},{"key":"ref57","volume-title":"Boost Llama 3.3 70B Inference Throughput 3x with NVIDIA TensorRT-LLM Speculative Decoding","author":"Shah","year":"2024"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358302"},{"key":"ref59","volume-title":"TSMC Details 5 nm Process Tech: Aggressive Scaling, But Thin Power and Performance Gains","author":"Shilov","year":"2018"},{"key":"ref60","volume-title":"TSMC\u2019s Roadmap at a Glance: N3X, N2P, A16 Coming in 2025\/2026","author":"Shilov","year":"2024"},{"key":"ref61","first-page":"1","article-title":"AMD Instinct\u2122 MI300X Accelerator: Packaging and Architecture Co-Optimization","volume-title":"2024 IEEE Symposium on VLSI Technology and Circuits (VLSI Technology and Circuits)","author":"Smith"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480095"},{"key":"ref63","first-page":"1","article-title":"NVIDIA Blackwell Platform: Advancing Generative AI and Accelerated Computing","volume-title":"2024 IEEE Hot Chips 36 Symposium (HCS)","author":"Tirumala"},{"key":"ref64","volume-title":"Cerebras Inference now 3x faster: Llama3.1\u201370B breaks 2,100 tokens\/s - Cerebras","author":"Wang","year":"2024"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1145\/3567955.3567959"},{"key":"ref66","article-title":"Thoughts Are All Over the Place: On the Underthinking of o1Like LLMs","volume-title":"arXiv","author":"Wang","year":"2025"},{"key":"ref67","article-title":"Chain-of-Thought Prompting Elicits Reasoning in Large Language Models","author":"Wei","year":"2023","journal-title":"arXiv"},{"key":"ref68","first-page":"1080","article-title":"LLMCompass: Enabling Efficient Hardware Design for Large Language Model Inference","volume-title":"2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA)","author":"Zhang"},{"key":"ref69","first-page":"861","article-title":"CAMEL: Co-Designing AI Models and eDRAMs for Efficient On-Device Learning","volume-title":"2024 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","author":"Zhang"},{"key":"ref70","first-page":"1071","article-title":"TransPIM: A Memory-based Acceleration via Software-Hardware Co-Design for Transformer","volume-title":"2022 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","author":"Zhou"},{"key":"ref71","first-page":"C300","article-title":"A 0.11 pJ\/Op, 0.32\u2013128 TOPS, Scalable Multi-Chip-Module-based Deep Neural Network Accelerator with Ground-Reference Signaling in 16nm","volume-title":"2019 Symposium on VLSI Circuits","author":"Zimmer"}],"event":{"name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","location":"Sydney, Australia","start":{"date-parts":[[2026,1,31]]},"end":{"date-parts":[[2026,2,4]]}},"container-title":["2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11408404\/11408433\/11408490.pdf?arnumber=11408490","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T06:50:25Z","timestamp":1772693425000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11408490\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":71,"URL":"https:\/\/doi.org\/10.1109\/hpca68181.2026.11408490","relation":{},"subject":[],"published":{"date-parts":[[2026,1,31]]}}}