{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T08:00:45Z","timestamp":1776931245040,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","funder":[{"name":"European High-Performance Computing Joint Undertaking (JU)","award":["Grant Agreement No. 101202459 (DARE SGA1 Project)"],"award-info":[{"award-number":["Grant Agreement No. 101202459 (DARE SGA1 Project)"]}]},{"name":"MICIU\/AEI (Agencia Estatal de Investigaci\uc3b3n, Spain)","award":["Project No. PCI2024-161687-3 Project No. PID2023-146511NB-I00 Project No. PID2023-147979NB-C21"],"award-info":[{"award-number":["Project No. PCI2024-161687-3 Project No. PID2023-146511NB-I00 Project No. PID2023-147979NB-C21"]}]},{"name":"European Union NextGenerationEU \/PRTR","award":["Linked to PCI2024-161687-3"],"award-info":[{"award-number":["Linked to PCI2024-161687-3"]}]},{"name":"Spanish Ministry for Digital Transformation and Public Service","award":["REGAGE22e00058408992 (Recovery Transformation and Resilience Plan NextGenerationEU)"],"award-info":[{"award-number":["REGAGE22e00058408992 (Recovery Transformation and Resilience Plan NextGenerationEU)"]}]},{"DOI":"10.13039\/501100002809","name":"Generalitat de Catalunya","doi-asserted-by":"publisher","award":["Contract No. 2021-SGR-00763"],"award-info":[{"award-number":["Contract No. 2021-SGR-00763"]}],"id":[{"id":"10.13039\/501100002809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"LenovoBSC Framework Contract (2020)","award":[""],"award-info":[{"award-number":[""]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3760547","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:19:56Z","timestamp":1760721596000},"page":"1300-1315","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Empowering Vector Architectures for ML: The CAMP Architecture for Matrix Multiplication"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2635-5312","authenticated-orcid":false,"given":"Mohammadreza Esmali","family":"Nojehdeh","sequence":"first","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3286-635X","authenticated-orcid":false,"given":"Hossein","family":"Mokhtarnia","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain and Polytechnic University of Catalonia, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8291-509X","authenticated-orcid":false,"given":"Julian","family":"Pavon","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0237-5654","authenticated-orcid":false,"given":"Narc\u00eds","family":"Rodas","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2407-1228","authenticated-orcid":false,"given":"Roger Figueras","family":"Bagu\u00e9","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1385-7962","authenticated-orcid":false,"given":"Enrico","family":"Reggiani","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9848-8758","authenticated-orcid":false,"given":"Miquel","family":"Moreto","sequence":"additional","affiliation":[{"name":"Polytechnic University of Catalonia, Barcelona, Spain and Barcelona Supercomputing Center, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0544-9697","authenticated-orcid":false,"given":"Osman","family":"Unsal","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1277-9296","authenticated-orcid":false,"given":"Adri\u00e1n","family":"Cristal","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5146-103X","authenticated-orcid":false,"given":"Eduard","family":"Ayguad\u00e9","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain and Polytechnic University of Catalonia, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2023. AMD CDNA 3 Architecture. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/instinct-tech-docs\/white-papers\/amd-cdna-3-white-paper.pdf. [Web accessed 2025\/10\/03 12:58:08]."},{"key":"e_1_3_3_2_3_2","unstructured":"2023. NVIDIA Ampere GA102 GPU Architecture. https:\/\/www.nvidia.com\/content\/PDF\/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf. [Web accessed 2025\/10\/03 12:58:08]."},{"key":"e_1_3_3_2_4_2","unstructured":"2024. RISC-V GNU Compiler Toolchain. Online. Available: https:\/\/github.com\/riscv\/riscv-gnu-toolchain."},{"key":"e_1_3_3_2_5_2","unstructured":"Mart\u00edn Abadi Ashish Agarwal Paul Barham Eugene Brevdo Zhifeng Chen Craig Citro Greg\u00a0S Corrado Andy Davis Jeffrey Dean Matthieu Devin et\u00a0al. 2016. Tensorflow: Large-scale machine learning on heterogeneous distributed systems. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1603.04467 (2016)."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Guillermo Alaejos Adri\u00e1n Castell\u00f3 H\u00e9ctor Mart\u00ednez Pedro Alonso-Jord\u00e1 Francisco\u00a0D Igual and Enrique\u00a0S Quintana-Ort\u00ed. 2023. Micro-kernels for portable and efficient matrix multiplication in deep learning. The Journal of Supercomputing (2023).","DOI":"10.21203\/rs.3.rs-1909301\/v1"},{"key":"e_1_3_3_2_7_2","unstructured":"Alibaba Cloud. 2022. Alibaba Cloud Unveils New Server Chips to Optimize Cloud Computing Services. [Online]. Available: https:\/\/www.alibabacloud.com\/blog\/598159."},{"key":"e_1_3_3_2_8_2","unstructured":"Amazon Web Services. 2023. AWS Inferentia. https:\/\/aws.amazon.com\/machine-learning\/inferentia\/. [Web accessed 2025\/10\/03 12:58:08]]."},{"key":"e_1_3_3_2_9_2","unstructured":"Apple Inc.2023. Apple unveils M3 M3 Pro and M3 Max the most advanced chips for a personal computer. https:\/\/www.apple.com\/newsroom\/2023\/10\/apple-unveils-m3-m3-pro-and-m3-max-the-most-advanced-chips-for-a-personal-computer\/."},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/COOLCHIPS52128.2021.9410320"},{"key":"e_1_3_3_2_11_2","unstructured":"ARM. Year. ARM Performance Libraries (ARMPL). https:\/\/developer.arm.com\/tools-and-software\/server-and-hpc\/compile\/arm-compiler-for-linux\/arm-performance-libraries. Version x.x."},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Nathan Binkert Bradford Beckmann Gabriel Black Steven\u00a0K Reinhardt Ali Saidi Arkaprava Basu Joel Hestness Derek\u00a0R Hower Tushar Krishna Somayeh Sardashti et\u00a0al. 2011. The gem5 simulator. ACM SIGARCH computer architecture news (2011).","DOI":"10.1145\/2024716.2024718"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICFPT51103.2020.00011"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3387902.3394038"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Alessio Burrello Angelo Garofalo Nazareno Bruschi Giuseppe Tagliavini Davide Rossi and Francesco Conti. 2021. Dory: Automatic end-to-end deployment of real-world dnns on low-cost iot mcus. IEEE Trans. Comput. 70 8 (2021) 1253\u20131268.","DOI":"10.1109\/TC.2021.3066883"},{"key":"e_1_3_3_2_16_2","unstructured":"Cadence Design Systems Inc.[n. d.]. Innovus Implementation System. https:\/\/www.cadence.com\/en_US\/home\/tools\/digital-design-and-signoff\/physical-implementation\/innovus-implementation-system.html. Accessed: 2025\/10\/03 12:58:08."},{"key":"e_1_3_3_2_17_2","volume-title":"Proceedings of the 55th Annual IEEE\/ACM International Symposium on Microarchitecture(MICRO)","author":"Cai Xuyi","year":"2023","unstructured":"Xuyi Cai, Ying Wang, Xiaohan Ma, Yinhe Han, and Lei Zhang. 2023. DeepBurning-SEG: Generating DNN Accelerators of Segment-Grained Pipeline Architecture. In Proceedings of the 55th Annual IEEE\/ACM International Symposium on Microarchitecture(MICRO)."},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"crossref","unstructured":"Alessandro Capotondi Manuele Rusci Marco Fariselli and Luca Benini. 2020. CMix-NN: Mixed low-precision CNN library for memory-constrained edge devices. IEEE Transactions on Circuits and Systems II: Express Briefs 67 5 (2020) 871\u2013875.","DOI":"10.1109\/TCSII.2020.2983648"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/PDP55904.2022.00023"},{"key":"e_1_3_3_2_20_2","volume-title":"Tenth international workshop on frontiers in handwriting recognition","author":"Chellapilla Kumar","year":"2006","unstructured":"Kumar Chellapilla, Sidd Puri, and Patrice Simard. 2006. High performance convolutional neural networks for document processing. In Tenth international workshop on frontiers in handwriting recognition. Suvisoft."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.58"},{"key":"e_1_3_3_2_22_2","unstructured":"Intel Corporation. 2023. Intel DL Boost (VNNI). Low Precision Integer Operations. https:\/\/ai.intel.com\/intel-deep-learning-boost. [Web accessed 2025\/10\/03 12:58:08]."},{"key":"e_1_3_3_2_23_2","volume-title":"NVIDIA Blackwell Architecture Technical Overview","author":"Corporation NVIDIA","year":"2024","unstructured":"NVIDIA Corporation. 2024. NVIDIA Blackwell Architecture Technical Overview. https:\/\/resources.nvidia.com\/en-us-blackwell-architecture [Web, accessed 2025\/10\/03 12:58:08]."},{"key":"e_1_3_3_2_24_2","unstructured":"Pooya Davoodi Guangda Lai Trevor Morris and Siddharth Sharma. 2019. High performance inference with TensorRT Integration. TensorFlow Blog. https:\/\/blog.tensorflow.org\/2019\/06\/high-performance-inference-with-TensorRT.html."},{"key":"e_1_3_3_2_25_2","unstructured":"Marat Dukhan Yiming Wu and Hao Lu. 2018. QNNPACK: Open source library for optimized mobile deep learning."},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Angelo Garofalo Manuele Rusci Francesco Conti Davide Rossi and Luca Benini. 2020. PULP-NN: Accelerating quantized neural networks on parallel ultra-low-power RISC-V processors. Philosophical Transactions of the Royal Society A 378 2164 (2020) 20190155.","DOI":"10.1098\/rsta.2019.0155"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"crossref","unstructured":"Angelo Garofalo Giuseppe Tagliavini Francesco Conti Luca Benini and Davide Rossi. 2021. Xpulpnn: Enabling energy efficient and flexible inference of quantized neural networks on risc-v based iot end nodes. IEEE Transactions on Emerging Topics in Computing 9 3 (2021) 1489\u20131505.","DOI":"10.1109\/TETC.2021.3072337"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Michael Gautschi Pasquale\u00a0Davide Schiavone Andreas Traber Igor Loi Antonio Pullini Davide Rossi Eric Flamand Frank\u00a0K G\u00fcrkaynak and Luca Benini. 2017. Near-threshold RISC-V core with DSP extensions for scalable IoT endpoint devices. IEEE transactions on very large scale integration (VLSI) systems 25 10 (2017) 2700\u20132713.","DOI":"10.1109\/TVLSI.2017.2654506"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00038"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00069"},{"key":"e_1_3_3_2_31_2","unstructured":"Google Cloud. 2023. System Architecture: TPU VM. https:\/\/cloud.google.com\/tpu\/docs\/system-architecture-tpu-vm."},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","unstructured":"Kazushige Goto and Robert A. van\u00a0de Geijn. 2008. Anatomy of High-Performance Matrix Multiplication. ACM Trans. Math. Softw. (2008). 10.1145\/1356052.1356053","DOI":"10.1145\/1356052.1356053"},{"key":"e_1_3_3_2_33_2","unstructured":"Kazushige Goto and Robert Van De\u00a0Geijn. 2008. High-Performance Implementation of the Level-3 BLAS. ACM Trans. Math. Softw. (2008)."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00035"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358263"},{"key":"e_1_3_3_2_36_2","unstructured":"Intel Corporation. 2023. Intel Architecture Instruction Set Extensions and Future Features."},{"key":"e_1_3_3_2_37_2","volume-title":"Introduction to the Xe HPG Architecture","author":"Corporation Intel","year":"2023","unstructured":"Intel Corporation. 2023. Introduction to the Xe HPG Architecture. Technical Report. https:\/\/cdrdv2-public.intel.com\/758302\/introduction-to-the-xe-hpg-architecture-white-paper.pdf [Web, accessed 2025\/10\/03 12:58:08]."},{"key":"e_1_3_3_2_38_2","unstructured":"Benoit Jacob and Pete Warden. 2022. gemmlowp: A small self-contained low-precision GEMM library. https:\/\/github.com\/google\/gemmlowp."},{"key":"e_1_3_3_2_39_2","first-page":"293","volume-title":"Doklady Akademii Nauk","author":"Karatsuba Anatolii\u00a0Alekseevich","year":"1962","unstructured":"Anatolii\u00a0Alekseevich Karatsuba and Yu\u00a0P Ofman. 1962. Multiplication of many-digital numbers by automatic computers. In Doklady Akademii Nauk , Vol.\u00a0145. Russian Academy of Sciences, 293\u2013294."},{"key":"e_1_3_3_2_40_2","unstructured":"Byeongho Kim Sanghoon Cha Sangsoo Park Jieun Lee Sukhan Lee Shin-haeng Kang Jinin So Kyungsoo Kim Jin Jung Jong-Geon Lee et\u00a0al. 2024. The Breakthrough Memory Solutions for Improved Performance on LLM Inference. IEEE Micro (2024)."},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"crossref","unstructured":"C.\u00a0L. Lawson R.\u00a0J. Hanson D.\u00a0R. Kincaid and F.\u00a0T. Krogh. 1979. Basic Linear Algebra Subprograms for Fortran Usage. ACM Trans. Math. Softw. (1979).","DOI":"10.1145\/355841.355847"},{"key":"e_1_3_3_2_42_2","unstructured":"Yann LeCun Corinna Cortes Chris Burges et\u00a0al. 2010. MNIST handwritten digit database."},{"key":"e_1_3_3_2_43_2","unstructured":"Michael\u00a0C. Lehn. 2014. ulmBLAS: A high performance BLAS implementation. https:\/\/github.com\/michael-lehn\/ulmBLAS. Ulm University Version 1.0."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614297"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2017.7870353"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895613"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER49012.2020.00075"},{"key":"e_1_3_3_2_48_2","unstructured":"Ryohei Okazaki Takekazu Tabata Sota Sakashita Kenichi Kitamura Noriko Takagi Hideki Sakata Takeshi Ishibashi Takeo Nakamura and Yuichiro Ajima. 2020. Supercomputer Fugaku Cpu A64fx realizing high performance high-density packaging and low power consumption. Fujitsu Technical Review (2020)."},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISVLSI49217.2020.000-5"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00084"},{"key":"e_1_3_3_2_51_2","unstructured":"Jongsoo Park Maxim Naumov Protonu Basu Summer Deng Aravind Kalaiah Daya Khudia James Law Parth Malani Andrey Malevich Satish Nadathur et\u00a0al. 2018. Deep learning inference in facebook data centers: Characterization performance optimizations and hardware implications. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1811.09886 (2018)."},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614282"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507746"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071076"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/SCW63240.2024.00185"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"crossref","unstructured":"Fabian Schuiki Florian Zaruba Torsten Hoefler and Luca Benini. 2020. Stream semantic registers: A lightweight risc-v isa extension achieving full compute utilization in single-issue cores. IEEE Trans. Comput. 70 2 (2020) 212\u2013227.","DOI":"10.1109\/TC.2020.2987314"},{"key":"e_1_3_3_2_57_2","unstructured":"Ying Sheng Lianmin Zheng Binhang Yuan Zhuohan Li Max Ryabinin Daniel\u00a0Y Fu Zhiqiang Xie Beidi Chen Clark Barrett Joseph\u00a0E Gonzalez et\u00a0al. 2023. High-throughput generative inference of large language models with a single gpu. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.06865 (2023)."},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSD57027.2022.00042"},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"crossref","unstructured":"William\u00a0J. Starke Brian\u00a0W. Thompto Jeff\u00a0A. Stuecheli and Jos\u00e9\u00a0E. Moreira. 2021. IBM\u2019s POWER10 Processor. IEEE Micro (2021).","DOI":"10.1109\/MM.2021.3058632"},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614247"},{"key":"e_1_3_3_2_61_2","unstructured":"Synopsys. [n. d.]. Synopsys. https:\/\/www.synopsys.com\/. [Web accessed 2025\/10\/03 12:58:08]."},{"key":"e_1_3_3_2_62_2","unstructured":"Tesla Inc.2023. Tesla Dojo Technology. https:\/\/digitalassets.tesla.com\/tesla-contents\/image\/upload\/tesla-dojo-technology.pdf. [[Web accessed 2025\/10\/03 12:58:08]]."},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"crossref","unstructured":"Field\u00a0G Van\u00a0Zee and Tyler\u00a0M Smith. 2017. Implementing high-performance complex matrix multiplication via the 3m and 4m methods. ACM Transactions on Mathematical Software (TOMS) 44 1 (2017) 1\u201336.","DOI":"10.1145\/3086466"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"crossref","unstructured":"Field\u00a0G Van\u00a0Zee and Robert\u00a0A Van De\u00a0Geijn. 2015. BLIS: A framework for rapidly instantiating BLAS functionality. ACM Transactions on Mathematical Software (TOMS) (2015).","DOI":"10.1145\/2764454"},{"key":"e_1_3_3_2_65_2","unstructured":"Oriol Vinyals and Quoc Le. 2015. A neural conversational model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1506.05869 (2015)."},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"crossref","unstructured":"Hanchen Wang Tianfan Fu Yuanqi Du Wenhao Gao Kexin Huang Ziming Liu Payal Chandak Shengchao Liu Peter Van\u00a0Katwyk Andreea Deac et\u00a0al. 2023. Scientific discovery in the age of artificial intelligence. Nature (2023).","DOI":"10.1038\/s41586-023-06559-7"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"e_1_3_3_2_68_2","volume-title":"Introducing the scalable matrix extension for the Armv9-A architecture","author":"Weidmann Martin","year":"2021","unstructured":"Martin Weidmann. 2021. Introducing the scalable matrix extension for the Armv9-A architecture. Technical Report. Technical report."},{"key":"e_1_3_3_2_69_2","unstructured":"Zhang Xianyi Wang Qian and Zaheer Chothia. 2012. OpenBLAS. URL: http:\/\/xianyi. github. io\/OpenBLAS (2012)."},{"key":"e_1_3_3_2_70_2","unstructured":"Xilinx. [n. d.]. Alveo U250 Data Center Accelerator Card. https:\/\/www.xilinx.com\/products\/boards-and-kits\/alveo\/u250.html. [Web accessed 2025\/10\/03 12:58:08]."},{"key":"e_1_3_3_2_71_2","doi-asserted-by":"publisher","DOI":"10.1109\/AICAS57966.2023.10168586"}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3760547","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:47:33Z","timestamp":1769464053000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3760547"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":70,"alternative-id":["10.1145\/3725843.3760547","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3760547","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}