{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,19]],"date-time":"2026-06-19T02:41:49Z","timestamp":1781836909649,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","funder":[{"name":"NSFC","award":["92373205"],"award-info":[{"award-number":["92373205"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756059","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:21:19Z","timestamp":1760721679000},"page":"675-690","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Titan-I: An Open-Source, High Performance RISC-V Vector Core"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2488-0693","authenticated-orcid":false,"given":"Jiuyang","family":"Liu","sequence":"first","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China and Xinpian Technology Co., Ltd., Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8212-362X","authenticated-orcid":false,"given":"Qinjun","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese Academy of Sciences, Beijing, China and Xinpian Technology Co., Ltd., Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3366-3722","authenticated-orcid":false,"given":"Yunqian","family":"Luo","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3826-9758","authenticated-orcid":false,"given":"Hongbin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese 
Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3675-3798","authenticated-orcid":false,"given":"Jiongjia","family":"Lu","sequence":"additional","affiliation":[{"name":"Xinpian Technology Co., Ltd., Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5314-3002","authenticated-orcid":false,"given":"Shupei","family":"Fan","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0221-6408","authenticated-orcid":false,"given":"Jianhao","family":"Ye","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3182-2290","authenticated-orcid":false,"given":"Yang","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5045-7884","authenticated-orcid":false,"given":"Xiaoyi","family":"Liu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5905-6866","authenticated-orcid":false,"given":"Yanqi","family":"Yang","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3623-3554","authenticated-orcid":false,"given":"Zewen","family":"Ye","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, 
China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7217-4620","authenticated-orcid":false,"given":"Yuhang","family":"Zeng","sequence":"additional","affiliation":[{"name":"Xinpian Technology Co., Ltd., Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1128-4664","authenticated-orcid":false,"given":"Ao","family":"Shen","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8719-3464","authenticated-orcid":false,"given":"Rui","family":"Huang","sequence":"additional","affiliation":[{"name":"Xinpian Technology Co., Ltd., Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6077-8707","authenticated-orcid":false,"given":"Wei","family":"Cong","sequence":"additional","affiliation":[{"name":"UCUN Technology Inc, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6404-5270","authenticated-orcid":false,"given":"Xuecheng","family":"Zou","sequence":"additional","affiliation":[{"name":"Henan Academy of Sciences, Zhengzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8433-7281","authenticated-orcid":false,"given":"Mingyu","family":"Gao","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Hossein Amiri and Asadollah Shahbahrami. 2020. SIMD programming using Intel vector extensions. J. Parallel and Distrib. Comput. 
135 (2020) 83\u2013100.","DOI":"10.1016\/j.jpdc.2019.09.012"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","unstructured":"D.\u00a0W. Anderson F.\u00a0J. Sparacio and R.\u00a0M. Tomasulo. 1967. The IBM System\/360 Model 91: Machine Philosophy and Instruction-Handling. IBM Journal of Research and Development 11 1 (Jan. 1967) 8\u201324. 10.1147\/rd.111.0008","DOI":"10.1147\/rd.111.0008"},{"key":"e_1_3_3_1_4_2","volume-title":"Architecture Specification Language Reference","author":"Limited ARM","year":"2024","unstructured":"ARM Limited. 2024. Architecture Specification Language Reference. Arm Limited. https:\/\/developer.arm.com\/documentation\/ddi0626."},{"key":"e_1_3_3_1_5_2","unstructured":"Krste Asanovic Andrew Waterman Colin Schmidt Albert Ou and Alon Amid. 2021. RISC-V Vector Extension 1.0. https:\/\/github.com\/riscv\/riscv-v-spec\/releases\/tag\/v1.0."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/2228360.2228584"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-14390-8_50"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-14390-8_50"},{"key":"e_1_3_3_1_9_2","unstructured":"Brad Burgess. 2023. Detailed Architecture Analysis and Key Features of SiFive\u2019s latest high-performance out-of-order Vector Processor. https:\/\/hc2023.hotchips.org\/."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","unstructured":"Matheus Cavalcante Fabian Schuiki Florian Zaruba Michael Schaffner and Luca Benini. 2020. Ara: A 1-GHz+ Scalable and Energy-Efficient RISC-V Vector Processor With Multiprecision Floating-Point Support in 22-nm FD-SOI. 28 2 (2020) 530\u2013543. 10.1109\/TVLSI.2019.2950087","DOI":"10.1109\/TVLSI.2019.2950087"},{"key":"e_1_3_3_1_11_2","unstructured":"Andes\u00a0Technology Corporation. 2025. AndesCore\u2122 AX45. 
https:\/\/www.andestech.com\/en\/products-solutions\/andescore-processors\/riscv-ax45\/."},{"key":"e_1_3_3_1_12_2","volume-title":"Workshop on Open-Source EDA Technology (WOSET)","author":"Eldridge Schuyler","year":"2021","unstructured":"Schuyler Eldridge, Prithayan Barua, Aliaksei Chapyzhenka, Adam Izraelevitz, Jack Koenig, Chris Lattner, Andrew Lenharth, George Leontiev, Fabian Schuiki, Ram Sunder, et\u00a0al. 2021. MLIR as hardware compiler infrastructure. In Workshop on Open-Source EDA Technology (WOSET) , Vol.\u00a03."},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.5555\/238171"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","unstructured":"John\u00a0L. Hennessy and David\u00a0A. Patterson. 2019. A new golden age for computer architecture. Commun. ACM 62 2 (Jan. 2019) 48\u201360. 10.1145\/3282307","DOI":"10.1145\/3282307"},{"key":"e_1_3_3_1_15_2","unstructured":"Rodrigo Huerta Mojtaba\u00a0Abaie Shoushtary Jos\u00e9-Lorenzo Cruz and Antonio Gonz\u00e1lez. 2025. Analyzing Modern NVIDIA GPU cores. arxiv:https:\/\/arXiv.org\/abs\/2503.20481\u00a0[cs.AR] https:\/\/arxiv.org\/abs\/2503.20481"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"RN Ibbett NP Topham RN Ibbett and NP Topham. 1989. The CDC Series. Architecture of High Performance Computers: Volume I: Uniprocessors and vector processors (1989) 156\u2013179.","DOI":"10.1007\/978-1-349-19757-6_9"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3624229"},{"key":"e_1_3_3_1_18_2","unstructured":"Apple Inc. 2022. Mac Studio (2022) - Technical Specifications. https:\/\/support.apple.com\/en-us\/111900."},{"key":"e_1_3_3_1_19_2","unstructured":"MediaTek Inc. 2025. MediaTek Dimensity 8400 First All Big Core Chip for Premium Smartphones. https:\/\/www.mediatek.com\/products\/smartphones\/mediatek-dimensity-8400."},{"key":"e_1_3_3_1_20_2","unstructured":"SiFive Inc. 2025. SiFive Intelligence X390. 
https:\/\/www.sifive.com\/cores\/intelligence-x390."},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3605731.3605904"},{"key":"e_1_3_3_1_22_2","unstructured":"Ju-Hung Li Jhih-Kuan Lin Yung-Cheng Su Chi-Wei Chu Lai-Tak Kuok Hung-Ming Lai Chao-Lin Lee and Jenq-Kuen Lee. 2023. SIMD Everywhere Optimization from ARM NEON to RISC-V Vector Extensions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.16509 (2023)."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Shang Li Zhiyuan Yang Dhiraj Reddy Ankur Srivastava and Bruce Jacob. 2020. DRAMsim3: A cycle-accurate thermal-capable DRAM simulator. IEEE Computer Architecture Letters 19 2 (2020) 106\u2013109.","DOI":"10.1109\/LCA.2020.2973991"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3677333.3678151"},{"key":"e_1_3_3_1_25_2","unstructured":"SpacemiT LLC. 2024. Key Stone K1. https:\/\/www.spacemit.com\/en\/key-stone-k1\/."},{"key":"e_1_3_3_1_26_2","unstructured":"Arm Ltd. 2020. Arm Scalable Vector Extension 2. https:\/\/developer.arm.com\/documentation\/102340\/0100\/."},{"key":"e_1_3_3_1_27_2","unstructured":"Arm Ltd. 2020. Learn the architecture - Introducing Neon. https:\/\/developer.arm.com\/documentation\/102474\/0100."},{"key":"e_1_3_3_1_28_2","unstructured":"Arm Ltd. 2022. Arm Scalable Vector Extension. https:\/\/developer.arm.com\/documentation\/102476\/0100\/."},{"key":"e_1_3_3_1_29_2","unstructured":"Arm Ltd. 2024. Arm Cortex-X925 Core Software Optimization Guide. https:\/\/developer.arm.com\/documentation\/109842\/."},{"key":"e_1_3_3_1_30_2","unstructured":"Google Ltd. 2018. Google Edge TPU. https:\/\/cloud.google.com\/edge-tpu."},{"key":"e_1_3_3_1_31_2","unstructured":"Intel Ltd. 2017. Intel\u00ae AVX-512 Instructions. 
https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/articles\/technical\/intel-avx-512-instructions.html."},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSD.2019.00045"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Zevi Miller and James\u00a0B. Orlin. 1985. NP-completeness for minimizing maximum edge length in grid embeddings. Journal of algorithms 6 1 (1985) 10\u201316.","DOI":"10.1016\/0196-6774(85)90016-1"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Francesco Minervini Oscar Palomar Osman Unsal Enrico Reggiani Josue Quiroga Joan Marimon Carlos Rojas Roger Figueras Abraham Ruiz Alberto Gonzalez et\u00a0al. 2023. Vitruvius+: an area-efficient RISC-V decoupled vector coprocessor for high performance computing applications. ACM Transactions on Architecture and Code Optimization 20 2 (2023) 1\u201325.","DOI":"10.1145\/3575861"},{"key":"e_1_3_3_1_35_2","unstructured":"NVIDIA. 2025. nvbench: CUDA Kernel Benchmarking Library. https:\/\/github.com\/NVIDIA\/nvbench. Accessed: 2025-06-12."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASAP54787.2022.00017"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"Wyatt Praharenka David Pankratz Jo\u00e3o\u00a0PL De\u00a0Carvalho Ehsan Amiri and Jos\u00e9\u00a0Nelson Amaral. 2022. Vectorizing divergent control flow with active-lane consolidation on long-vector architectures. The Journal of Supercomputing 78 10 (2022) 12553\u201312588.","DOI":"10.1007\/s11227-022-04359-w"},{"key":"e_1_3_3_1_38_2","first-page":"548","volume-title":"International Conference on Vector and Parallel Processing","author":"Quintana Francisca","year":"1998","unstructured":"Francisca Quintana, Roger Espasa, and Mateo Valero. 1998. An ISA comparison between superscalar and vector processors. In International Conference on Vector and Parallel Processing. 
Springer, 548\u2013560."},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","unstructured":"Richard\u00a0M. Russell. 1978. The CRAY-1 computer system. Commun. ACM 21 1 (Jan. 1978) 63\u201372. 10.1145\/359327.359336","DOI":"10.1145\/359327.359336"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Larry Seiler Doug Carmean Eric Sprangle Tom Forsyth Michael Abrash Pradeep Dubey Stephen Junkins Adam Lake Jeremy Sugerman Robert Cavin et\u00a0al. 2008. Larrabee: a many-core x86 architecture for visual computing. ACM Transactions on Graphics (TOG) 27 3 (2008) 1\u201315.","DOI":"10.1145\/1360612.1360617"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3547276.3548513"},{"key":"e_1_3_3_1_42_2","unstructured":"Mojtaba\u00a0Abaie Shoushtary Jordi\u00a0Tubella Murgadas and Antonio Gonzalez. 2024. Control Flow Management in Modern GPUs. arxiv:https:\/\/arXiv.org\/abs\/2407.02944\u00a0[cs.AR] https:\/\/arxiv.org\/abs\/2407.02944"},{"key":"e_1_3_3_1_43_2","unstructured":"Xuantie Team. 2023. Xuantie C906. https:\/\/www.xrvm.com\/product\/xuantie\/C906."},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCTIS58954.2023.10213200"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Jing Xia Chuanning Cheng Xiping Zhou Yuxing Hu and Peter Chun. 2021. Kunpeng 920: The first 7-nm chiplet-based 64-core arm soc for cloud services. IEEE Micro 41 5 (2021) 67\u201375.","DOI":"10.1109\/MM.2021.3085578"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00080"},{"key":"e_1_3_3_1_47_2","unstructured":"Zewen Ye Junhao Huang Tianshun Huang Yudan Bai Jinze Li Hao Zhang Guangyan Li Donglong Chen Ray\u00a0CC Cheung and Kejie Huang. 2025. PQNTRU: Acceleration of NTRU-based Schemes via Customized Post-Quantum Processor. IEEE Trans. Comput. 
(2025)."},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"crossref","unstructured":"Zewen Ye Ruibing Song Hao Zhang Donglong Chen Ray Chak-Chung Cheung and Kejie Huang. 2024. A Highly-efficient Lattice-based Post-Quantum Cryptography Processor for IoT Applications. IACR Transactions on Cryptographic Hardware and Embedded Systems 2024 2 (2024) 130\u2013153.","DOI":"10.46586\/tches.v2024.i2.130-153"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"crossref","unstructured":"Meng-Shiun Yu Hao-Chun Chang Chong-Teng Wang Yu-Wei Tien Tai-Liang Chen and Jenq-Kuen Lee. 2025. Optimizing computer vision algorithms with TVM on VLIW architecture based on RVV. The Journal of Supercomputing 81 1 (2025) 172.","DOI":"10.1007\/s11227-024-06530-x"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"crossref","unstructured":"Hongbin Zhang Mingjie Xing Yanjun Wu and Chen Zhao. 2023. Compiler Technologies in Deep Learning Co-Design: A Survey. Intelligent Computing 2 (2023) 0040.","DOI":"10.34133\/icomputing.0040"},{"key":"e_1_3_3_1_51_2","unstructured":"Chenggang Zhao Liang Zhao Jiashi Li and Zhean Xu. 2025. DeepGEMM: clean and efficient FP8 GEMM kernels with fine-grained scaling. https:\/\/github.com\/deepseek-ai\/DeepGEMM."},{"key":"e_1_3_3_1_52_2","volume-title":"The Saturn Microarchitecture Manual","author":"Zhao Jerry","year":"2024","unstructured":"Jerry Zhao, Daniel Grubb, Miles Rusch, Tianrui Wei, Kevin Anderson, Borivoje Nikolic, and Krste Asanovi\u0107. 2024. The Saturn Microarchitecture Manual. Technical Report UCB\/EECS-2024-215. EECS Department, University of California, Berkeley. http:\/\/www2.eecs.berkeley.edu\/Pubs\/TechRpts\/2024\/EECS-2024-215.html"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627915.3628090"},{"key":"e_1_3_3_1_54_2","unstructured":"Ali \u015eah \u00d6zcan and Erkay Sava\u015f. 2023. Two Algorithms for Fast GPU Implementation of NTT. Cryptology ePrint Archive Paper 2023\/1410. 
https:\/\/eprint.iacr.org\/2023\/1410 https:\/\/eprint.iacr.org\/2023\/1410."}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756059","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:48:35Z","timestamp":1769464115000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756059"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":53,"alternative-id":["10.1145\/3725843.3756059","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756059","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}