{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T12:35:09Z","timestamp":1774096509285,"version":"3.50.1"},"reference-count":159,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100017610","name":"Shenzhen Science and Technology Innovation Commission","doi-asserted-by":"publisher","award":["KJZD20230923113300002"],"award-info":[{"award-number":["KJZD20230923113300002"]}],"id":[{"id":"10.13039\/501100017610","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/access.2025.3609769","type":"journal-article","created":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T17:35:04Z","timestamp":1758044104000},"page":"173116-173145","source":"Crossref","is-referenced-by-count":1,"title":["A Component-Centric Perspective on Hardware Accelerators for LLMs"],"prefix":"10.1109","volume":"13","author":[{"given":"Jia","family":"Ke","sequence":"first","affiliation":[{"name":"School of Microelectronics, Southern University of Science and Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4389-7057","authenticated-orcid":false,"given":"Wang","family":"Xiaohao","sequence":"additional","affiliation":[{"name":"Graduate School at Shenzhen, Tsinghua University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chen","family":"Hailin","sequence":"additional","affiliation":[{"name":"School of Microelectronics, Southern University of Science and Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhong","family":"Wei","sequence":"additional","affiliation":[{"name":"ICube, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Li","family":"Xinxiong","sequence":"additional","affiliation":[{"name":"ICube, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fang","family":"Zenan","sequence":"additional","affiliation":[{"name":"ICube, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7554-7938","authenticated-orcid":false,"given":"An","family":"Fengwei","sequence":"additional","affiliation":[{"name":"School of Microelectronics, Southern University of Science and Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Brown"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.14778\/3570690.3570697"},{"key":"ref3","first-page":"1","article-title":"Nimble: Lightweight and parallel GPU task scheduling for deep learning","volume-title":"Proc. 34th Int. Conf. Neural Inf. Process. Syst.","author":"Kwon"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS53621.2022.00065"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2019.2935967"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.58496\/BJAI\/2023\/005"},{"key":"ref7","article-title":"How hungry is AI? Benchmarking energy, water, and carbon footprint of LLM inference","author":"Jegham","year":"2025","journal-title":"arXiv:2505.09598"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3656177"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.3934\/era.2022192"},{"key":"ref10","article-title":"A comprehensive performance study of large language models on novel AI accelerators","author":"Emani","year":"2023","journal-title":"arXiv:2310.04607"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TAI.2024.3377147"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.3390\/app15020586"},{"key":"ref13","volume-title":"Datalearner LLM Coding Leaderboard","year":"2025"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3289185"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3656177"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895626"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/SOCC49529.2020.9524802"},{"key":"ref18","article-title":"LLM inference unveiled: Survey and roofline model insights","author":"Yuan","year":"2024","journal-title":"arXiv:2402.16363"},{"key":"ref19","article-title":"ParetoQ: Scaling laws in extremely low-bit LLM quantization","author":"Liu","year":"2025","journal-title":"arXiv:2502.02631"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.433"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19775-8_12"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01201"},{"key":"ref23","article-title":"How to parameterize asymmetric quantization ranges for quantization-aware training","author":"You","year":"2024","journal-title":"arXiv:2404.16898"},{"key":"ref24","article-title":"GPTAQ: Efficient finetuning-free quantization for asymmetric calibration","author":"Li","year":"2025","journal-title":"arXiv:2504.02692"},{"key":"ref25","article-title":"The uniqueness of LLaMA3\u201370B series with per-channel quantization","author":"Qin","year":"2024","journal-title":"arXiv:2408.15301"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414076"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00157"},{"key":"ref28","article-title":"I&S-ViT: An inclusive & stable method for pushing the limit of post-training ViTs quantization","author":"Zhong","year":"2023","journal-title":"arXiv:2311.10126"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICAIRC64177.2024.10899941"},{"key":"ref30","first-page":"35","article-title":"Practical edge kernels for integer-only vision transformers under post-training quantization","volume":"5","author":"Zhang","year":"2023","journal-title":"Proc. Mach. Learn. Syst."},{"key":"ref31","first-page":"66357","article-title":"OneBit: Towards extremely low-bit large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"37","author":"Xu"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3676536.3676796"},{"key":"ref33","article-title":"MixPE: Quantization and hardware co-design for efficient LLM inference","author":"Zhang","year":"2024","journal-title":"arXiv:2411.16158"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/iiswc59245.2023.00026"},{"key":"ref35","article-title":"Enabling unstructured sparse acceleration on structured sparse accelerators","author":"Jeong","year":"2024","journal-title":"arXiv:2403.07953"},{"key":"ref36","article-title":"Dynamic sparse training with structured sparsity","author":"Lasby","year":"2023","journal-title":"arXiv:2305.02299"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3447085"},{"key":"ref38","first-page":"10323","article-title":"SparseGPT: Massive language models can be accurately pruned in one-shot","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Frantar"},{"key":"ref39","article-title":"SliceGPT: Compress large language models by deleting rows and columns","author":"Ashkboos","year":"2024","journal-title":"arxiv:401.15024"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i10.28960"},{"key":"ref41","first-page":"21702","article-title":"LLM-pruner: On the structural pruning of large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ma"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.178"},{"key":"ref43","article-title":"Everybody prune now: Structured pruning of LLMs with only forward passes","author":"Dery","year":"2024","journal-title":"arXiv:2402.05406"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC55821.2022.9926300"},{"key":"ref45","article-title":"Accelerating transformer inference and training with 2:4 activation sparsity","author":"Haziza","year":"2025","journal-title":"arXiv:2503.16672"},{"key":"ref46","article-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","author":"Shazeer","year":"2017","journal-title":"arXiv:1701.06538"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.70"},{"key":"ref48","article-title":"Mixture of experts models in deep learning and their techniques applications and challenges","author":"Sankar","year":"2025","journal-title":"TechRxiv"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00092"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/SCW63240.2024.00178"},{"key":"ref51","volume-title":"NVIDIA NIM Large Language Models Benchmarking","year":"2025"},{"key":"ref52","first-page":"16344","article-title":"FlashAttention: Fast and memory-efficient exact attention with IO-awareness","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dao"},{"key":"ref53","article-title":"FlashAttention-2: Faster attention with better parallelism and work partitioning","author":"Dao","year":"2023","journal-title":"arXiv:2307.08691"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575747"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00043"},{"issue":"4","key":"ref57","first-page":"1","article-title":"DNNFusion: Accelerating deep neural networks execution with advanced operator fusion","volume":"17","author":"Niu","year":"2020","journal-title":"ACM Trans. Archit. Code Optim."},{"key":"ref58","article-title":"Inference with reference: Lossless acceleration of large language models","author":"Yang","year":"2023","journal-title":"arXiv:2304.04487"},{"key":"ref59","first-page":"92","article-title":"Accelerating transformer networks through recomposing softmax layers","volume-title":"Proc. IEEE Int. Symp. Workload Characterization","author":"Choi"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1145\/3370748.3406567"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/3431920.3439477"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC58850.2023.00039"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/dac56929.2023.10247678"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS58744.2024.10558631"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2022.3197282"},{"key":"ref66","article-title":"VAQF: Fully automatic software\u2013hardware co-design framework for low-bit vision transformer","author":"Sun","year":"2022","journal-title":"arXiv:2201.06618"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ASP-DAC58780.2024.10473931"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/FPL60245.2023.00048"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3626202.3637562"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/TCSII.2024.3462560"},{"key":"ref71","first-page":"1","article-title":"SALTS: An efficient and flexible self-attention accelerator with long token support on FPGA","volume-title":"Proc. IEEE 17th Int. Conf. Solid-State Integr. Circuit Technol.","author":"Chen"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICTAI56018.2022.00095"},{"key":"ref73","first-page":"1311","article-title":"Funnel: An efficient sparse attention accelerator with multi-dataflow fusion","volume-title":"Proc. IEEE Int. Symp. Parallel Distrib. Process. Appl.","author":"Ma"},{"key":"ref74","first-page":"1","article-title":"UbiMoE: A ubiquitous mixture-of-experts vision transformer accelerator with hybrid computation pattern on FPGA","volume-title":"Proc. IEEE Int. Symp. Circuits Syst. (ISCAS)","author":"Dong"},{"key":"ref75","first-page":"1","article-title":"ITA: An energy-efficient attention and softmax accelerator for quantized transformers","volume-title":"Proc. IEEE\/ACM Int. Symp. Low Power Electron. Des.","author":"Islamoglu"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1145\/3649329.3655982"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00111"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/hpca47549.2020.00035"},{"key":"ref79","first-page":"692","article-title":"ELSA: hardware\u2013software co-design for efficient, lightweight self-attention mechanism in neural networks","volume-title":"Proc. ACM\/IEEE 48th Annu. Int. Symp. Comput. Archit. (ISCA)","author":"Ham"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480125"},{"key":"ref81","first-page":"97","article-title":"SpAtten: Efficient sparse attention architecture with cascade token and head pruning","volume-title":"Proc. IEEE Int. Symp. High-Perform. Comput. Archit. (HPCA)","author":"Wang"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2023.3273992"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2023.3299509"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1145\/3649219"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247913"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2023.3337777"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640422"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2023.3305663"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1038\/s44335-024-00004-2"},{"key":"ref90","first-page":"1","article-title":"ReTransformer: ReRAM-based processing-in-memory architecture for transformer acceleration","volume-title":"Proc. IEEE\/ACM Int. Conf. Comput. Aided Design (ICCAD)","author":"Yang"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00082"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/TCSI.2019.2945617"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2019.2908101"},{"key":"ref94","first-page":"802","article-title":"FloatPIM: In-memory acceleration of deep neural network training with high precision","volume-title":"Proc. ACM\/IEEE 46th Annu. Int. Symp. Comput. Archit. (ISCA)","author":"Imani"},{"key":"ref95","first-page":"288","article-title":"DRISA: A DRAM-based reconfigurable in-situ accelerator","volume-title":"Proc. 50th Annu. IEEE\/ACM Int. Symp. Microarchitecture (MICRO)","author":"Li"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2023.3282046"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1145\/2996864"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.58"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1145\/2872887.2750389"},{"key":"ref100","volume-title":"LPU Chip Used By Groq Ai Offers Record Performance","year":"2024"},{"key":"ref101","volume-title":"Rockchip NPU RK-Series","year":"2024"},{"key":"ref102","volume-title":"AMD Instinct\u2013MI300x Accelerators","year":"2025"},{"key":"ref103","volume-title":"Meet Sohu: The World\u2019s First Transformer ASIC","year":"2024"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3455008"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref106","article-title":"Ring attention with blockwise transformers for near-infinite context","author":"Liu","year":"2023","journal-title":"arXiv:2310.01889"},{"key":"ref107","volume-title":"Fairscale: A General Purpose Modular PyTorch Library for High Performance and Large Scale Training","year":"2021"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330701"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640366"},{"key":"ref110","first-page":"578","article-title":"TVM: An automated end-to-end optimizing compiler for deep learning","volume-title":"Proc. 13th USENIX Symp. Operating Syst. Design Implement.","author":"Chen"},{"key":"ref111","first-page":"1","article-title":"MNN: A universal and efficient inference engine","volume":"2","author":"Jiang","year":"2020","journal-title":"Proc. Mach. Learn. Syst."},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN54540.2023.10191521"},{"key":"ref113","first-page":"1","article-title":"SqueezeLLM: Dense-and-sparse quantization","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Kim"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1145\/3676536.3676743"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/SCW63240.2024.00074"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1145\/2644865.2541967"},{"key":"ref117","article-title":"Cooperative hardware\/software caching for next-generation memory systems","author":"Wang","year":"2004"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1145\/3665314.3670814"},{"key":"ref119","article-title":"Vision transformer accelerator ASIC for real-time low-power sleep staging","author":"Robitaille","year":"2025","journal-title":"arXiv:2502.16334"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1145\/3649329.3656507"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507738"},{"issue":"4","key":"ref122","doi-asserted-by":"crossref","first-page":"1290","DOI":"10.1109\/TCAD.2024.3483092","article-title":"OPASCA: Outer product-based accelerator with unified architecture for sparse convolution and attention","volume":"44","author":"Zhou","year":"2025","journal-title":"IEEE Trans. Comput.-Aided Design Integr. Circuits Syst."},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/ICFPT64416.2024.11113430"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1145\/3676536.3676766"},{"key":"ref125","first-page":"45777","article-title":"Pre-RMSNorm and pre-CRMSNorm transformers: Equivalent and efficient pre-LN transformers","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Jiang"},{"key":"ref126","article-title":"You can remove GPT2\u2019s LayerNorm by fine-tuning","author":"Heimersheim","year":"2024","journal-title":"arXiv:2409.13710"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/CLOUD67622.2025.00036"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/ICFPT64416.2024.11113456"},{"key":"ref129","article-title":"The feasibility of implementing large-scale transformers on multi-FPGA platforms","author":"Gao","year":"2024","journal-title":"arXiv:2404.16158"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2006.884574"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.35848\/1347-4065\/ad93e0"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2025.3553069"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD50377.2020.00079"},{"key":"ref134","article-title":"Memory is all you need: An overview of compute-in-memory architectures for accelerating large language model inference","author":"Wolters","year":"2024","journal-title":"arXiv:2406.08413"},{"key":"ref135","article-title":"Enabling the adoption of processing-in-memory: Challenges, mechanisms, future research directions","author":"Ghose","year":"2018","journal-title":"arXiv:1802.00320"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-024-55220-y"},{"key":"ref137","article-title":"A survey on quantum machine learning: Current trends, challenges, opportunities, and the road ahead","author":"Zaman","year":"2023","journal-title":"arXiv:2310.10315"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2025.107953"},{"key":"ref139","first-page":"117","article-title":"Taming throughput-latency tradeoff in LLM inference with Sarathi\u2013Serve","volume-title":"Proc. 18th USENIX Symp. Operating Syst. Design Implement.","author":"Agrawal"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"ref141","article-title":"Throughput-optimal scheduling algorithms for LLM inference and AI agents","author":"Li","year":"2025","journal-title":"arXiv:2504.07347"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1145\/3725394"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001177"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1145\/3511094"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750397"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.23919\/DATE58400.2024.10546582"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2023.3345651"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/MNET.2024.3449276"},{"key":"ref149","article-title":"Addition is all you need for energy-efficient language models","author":"Luo","year":"2024","journal-title":"arXiv:2410.00907"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1007\/s10444-025-10232-0"},{"key":"ref151","article-title":"Changing base without losing pace: A GPU-efficient alternative to MatMul in DNNs","author":"Ailon","year":"2025","journal-title":"arXiv:2503.12211"},{"key":"ref152","article-title":"EcoTransformer: Attention without multiplication","author":"Gao","year":"2025","journal-title":"arXiv:2507.20096"},{"key":"ref153","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020","journal-title":"arXiv:2001.08361"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00078"},{"key":"ref155","article-title":"HarMoEny: Efficient multi-GPU inference of MoE models","author":"Doucet","year":"2025","journal-title":"arXiv:2506.12417"},{"key":"ref156","article-title":"MoETuner: Optimized mixture of expert serving with balanced expert placement and token routing","author":"Go","year":"2025","journal-title":"arXiv:2502.06643"},{"key":"ref157","volume-title":"Companies Rally RISV-V Support for AI and HPC Applications","author":"Nijhawan","year":"2024"},{"key":"ref158","volume-title":"RISC-V & High Performance Computing (HPC)","year":"2025"},{"key":"ref159","article-title":"MARVEL: An end-to-end framework for generating model-class aware custom RISC-V extensions for lightweight AI","author":"Kumar M","year":"2025","journal-title":"arXiv:2508.01800"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6287639\/10820123\/11164709.pdf?arnumber=11164709","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,10]],"date-time":"2025-10-10T05:00:27Z","timestamp":1760072427000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11164709\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":159,"URL":"https:\/\/doi.org\/10.1109\/access.2025.3609769","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}