{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,14]],"date-time":"2026-06-14T05:53:01Z","timestamp":1781416381874,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":81,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nd\/4.0\/"}],"funder":[{"name":"HORIZON EUROPE Research and Innovation program","award":["101070375"],"award-info":[{"award-number":["101070375"]}]},{"name":"Research Foundation Flanders (FWO)","award":["1SE7723N"],"award-info":[{"award-number":["1SE7723N"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1145\/3696443.3708952","type":"proceedings-article","created":{"date-parts":[[2025,2,22]],"date-time":"2025-02-22T11:50:26Z","timestamp":1740225026000},"page":"163-178","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["A Multi-level Compiler Backend for Accelerated Micro-kernels Targeting RISC-V ISA Extensions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-9190-0301","authenticated-orcid":false,"given":"Alexandre","family":"Lopoukhine","sequence":"first","affiliation":[{"name":"University of Cambridge, Cambridge, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1447-8278","authenticated-orcid":false,"given":"Federico","family":"Ficarelli","sequence":"additional","affiliation":[{"name":"University of Bologna, Bologna, Italy"},{"name":"Cineca, Bologna, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7936-2183","authenticated-orcid":false,"given":"Christos","family":"Vasiladiotis","sequence":"additional","affiliation":[{"name":"University of Edinburgh, Edinburgh, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9389-8512","authenticated-orcid":false,"given":"Anton","family":"Lydike","sequence":"additional","affiliation":[{"name":"University of Edinburgh, Edinburgh, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9503-403X","authenticated-orcid":false,"given":"Josse","family":"Van Delm","sequence":"additional","affiliation":[{"name":"KU Leuven, Leuven, Belgium"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7978-0608","authenticated-orcid":false,"given":"Alban","family":"Dutilleul","sequence":"additional","affiliation":[{"name":"ENS Rennes, Rennes, France"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8068-3806","authenticated-orcid":false,"given":"Luca","family":"Benini","sequence":"additional","affiliation":[{"name":"ETH Zurich, Zurich, Switzerland"},{"name":"University of Bologna, Bologna, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3495-9263","authenticated-orcid":false,"given":"Marian","family":"Verhelst","sequence":"additional","affiliation":[{"name":"KU Leuven, Leuven, Belgium"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3874-6003","authenticated-orcid":false,"given":"Tobias","family":"Grosser","sequence":"additional","affiliation":[{"name":"University of Cambridge, Cambridge, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,3]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. Cranelift Code Generator. https:\/\/github.com\/bytecodealliance\/wasmtime\/tree\/main\/cranelift"},{"key":"e_1_3_2_1_2_1","unstructured":"2024. The European Processor Initiative Accelerator Processor Stream. https:\/\/www.european-processor-initiative.eu\/accelerator\/"},{"key":"e_1_3_2_1_3_1","unstructured":"2024. GCC the GNU Compiler Collection - GNU Project. https:\/\/gcc.gnu.org\/"},{"key":"e_1_3_2_1_4_1","unstructured":"2024. High performance RISC-V CPUs. https:\/\/www.ventanamicro.com\/technology\/risc-v-cpu-ip\/"},{"key":"e_1_3_2_1_5_1","unstructured":"2024. Intel oneDNN. https:\/\/github.com\/oneapi-src\/oneDNN"},{"key":"e_1_3_2_1_6_1","unstructured":"2024. LLVM for PULP Platform Projects Snitch RISC-V ISA Extension Support PULP Project. https:\/\/github.com\/pulp-platform\/llvm-project\/tree\/d2f0eff9be1f58bb186499e2055eb6888ce88dcc"},{"key":"e_1_3_2_1_7_1","unstructured":"2024. MLIR Documentation: linalg Dialect. https:\/\/mlir.llvm.org\/docs\/Dialects\/Linalg"},{"key":"e_1_3_2_1_8_1","unstructured":"2024. NVIDIA cuDNN. https:\/\/developer.nvidia.com\/cudnn"},{"key":"e_1_3_2_1_9_1","unstructured":"2024. oneAPI Programming Model. https:\/\/www.oneapi.io\/"},{"key":"e_1_3_2_1_10_1","unstructured":"2024. RISC-V Packed SIMD Extension. https:\/\/github.com\/riscv\/riscv-p-spec"},{"key":"e_1_3_2_1_11_1","unstructured":"2024. Snitch Cluster PULP Project. https:\/\/github.com\/pulp-platform\/snitch_cluster\/tree\/772b86ae84ec0d5a6f1e755cb524ba0aae2cefc3"},{"key":"e_1_3_2_1_12_1","unstructured":"2024. Torch-MLIR. https:\/\/github.com\/llvm\/torch-mlir"},{"key":"e_1_3_2_1_13_1","unstructured":"2024. Verilator. https:\/\/www.veripool.org\/wiki\/verilator"},{"key":"e_1_3_2_1_14_1","unstructured":"2024. xDSL: A Python Compiler Design Toolkit. https:\/\/github.com\/xdslproject\/xdsl"},{"key":"e_1_3_2_1_15_1","unstructured":"2024. XuanTie RISC-V ISA Extensions Specification. https:\/\/github.com\/XUANTIE-RV\/thead-extension-spec"},{"key":"e_1_3_2_1_16_1","volume-title":"TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Abadi Martin","year":"2016","unstructured":"Martin Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16). 265\u2013283. isbn:978-1-931971-33-1 https:\/\/www.usenix.org\/conference\/osdi16\/technical-sessions\/presentation\/abadi"},{"key":"e_1_3_2_1_17_1","volume-title":"Optimizing Compilers for Modern Architectures: A Dependence-based Approach (1 ed.). Morgan Kaufmann","author":"Allen Randy","unstructured":"Randy Allen and Ken Kennedy. 2001. Optimizing Compilers for Modern Architectures: A Dependence-based Approach (1 ed.). Morgan Kaufmann, San Francisco. isbn:978-1-55860-286-1"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.5555\/3314872.3314896"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651344"},{"key":"e_1_3_2_1_20_1","volume-title":"March, arxiv:2003.00532 arXiv","author":"Bondhugula Uday","year":"2003","unstructured":"Uday Bondhugula. 2020. High Performance Code Generation in MLIR: An Early Case Study with GEMM. arXiv:2003.00532 [cs], March, arxiv:2003.00532 arXiv: 2003.00532"},{"key":"e_1_3_2_1_21_1","unstructured":"Florent Bouchez Alain Darte Christophe Guillon and Fabrice Rastello. 2005. Register Allocation and Spill Complexity under SSA. Laboratoire de l\u2019informatique du parall\u00e9lisme 2+28p.. https:\/\/hal-lara.archives-ouvertes.fr\/hal-02102197"},{"key":"e_1_3_2_1_22_1","volume-title":"SSA-based Compiler Design","year":"2022","unstructured":"2023. SSA-based Compiler Design (1st ed. 2022 ed.), Florent Bouchez Tichadou and Fabrice Rastello (Eds.). Springer Nature Switzerland AG, Cham. isbn:978-3-030-80514-2","edition":"1"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-60276-5_8"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO57630.2024.10444883"},{"key":"e_1_3_2_1_25_1","first-page":"47","volume-title":"Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation (OSDI\u201918)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Meghan Cowan, Haichen Shen, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation (OSDI\u201918). USENIX Association, USA. 579\u2013594. isbn:978-1-931971-47-8"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","unstructured":"Tianqi Chen Lianmin Zheng Eddie Yan Ziheng Jiang Thierry Moreau Luis Ceze Carlos Guestrin and Arvind Krishnamurthy. 2019. Learning to Optimize Tensor Programs. https:\/\/doi.org\/10.48550\/arXiv.1805.08166 arxiv:1805.08166. 10.48550\/arXiv.1805.08166","DOI":"10.48550\/arXiv.1805.08166"},{"key":"e_1_3_2_1_27_1","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cuDNN: Efficient Primitives for Deep Learning. arXiv:1410.0759 [cs] Dec. arxiv:1410.0759. arxiv:1410.0759"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/C2014-0-01395-0"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2022.3140674"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/77626.79170"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","unstructured":"Mathieu Fehr Michel Weber Christian Ulmann Alexandre Lopoukhine Martin L\u00fccke Th\u00e9o Degioanni Michel Steuwer and Tobias Grosser. 2023. Sidekick Compilation with xDSL. https:\/\/doi.org\/10.48550\/arXiv.2311.07422 arxiv:2311.07422. 10.48550\/arXiv.2311.07422","DOI":"10.48550\/arXiv.2311.07422"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICECS46596.2019.8965067"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2017.2654506"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2017.2654506"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3559009.3569682"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/1356052.1356053"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1142\/S0129626412500107"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3469030"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/11688839_20"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414632"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.83"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3282307"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3519939.3523446"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/2544137.2544161"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2008.08272"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3497776.3517770"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3133901"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3192366.3192379"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/2464996.2465012"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1002\/spe.3214"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2004.1281665"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370308"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446759"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2022.3178068"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.5281\/zenodo.14052014"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446712"},{"key":"e_1_3_2_1_58_1","unstructured":"Jeff Niu and Mehdi Amini. 2023. MLIR Dialect Design and Composition for Front-End Compilers. https:\/\/llvm.org\/devmtg\/2023-05\/"},{"key":"e_1_3_2_1_59_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems. 32, Curran Associates, Inc.. isbn:978-1-71380-793-3 https:\/\/proceedings.neurips.cc\/paper\/2019\/hash\/bdbca288fee7f92f2bfa9f7012727740-Abstract.html"},{"key":"e_1_3_2_1_60_1","unstructured":"Dylan Patel. 2021. Tenstorrent Wormhole Analysis - A Scale Out Architecture for Machine Learning That Could Put Nvidia On Their Back Foot. https:\/\/www.semianalysis.com\/p\/tenstorrent-wormhole-analysis-a-scale"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISVLSI54635.2022.00021"},{"key":"e_1_3_2_1_62_1","unstructured":"Fernando Magno Quint\u00e3o Pereira. 2007. The Design and Implementation of a SSA-based Register Allocator. https:\/\/www.semanticscholar.org\/paper\/The-Design-and-Implementation-of-a-SSA-based-Pereira\/9266bd5e1102892dcbe6e38907abc119bb4e761f"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2019.2912307"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2020.2987314"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2017.7863730"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.23919\/DATE.2019.8714897"},{"key":"e_1_3_2_1_68_1","unstructured":"Jim M. R. Teichgr\u00e4ber. 2023. Efficient Compilation of an Extensible Intermediate Representation. Technische Universit\u00e4t M\u00fcnchen. https:\/\/github.com\/J-MR-T\/MoNaCo"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3570641"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/2755561"},{"key":"e_1_3_2_1_72_1","unstructured":"Steven Varoumas. 2023. Using MLIR to Optimize Basic Linear Algebraic Subprograms. https:\/\/llvm.org\/devmtg\/2023-05\/slides\/TechnicalTalks-May10\/08-Varoumas-UsingMLIR-to-OptimizeBasicLinearAlgebraicSubprograms.pdf"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","unstructured":"Nicolas Vasilache Oleksandr Zinenko Aart J. C. Bik Mahesh Ravishankar Thomas Raoux Alexander Belyaev Matthias Springer Tobias Gysi Diego Caballero Stephan Herhut Stella Laurenzo and Albert Cohen. 2022. Composable and Modular Code Generation in MLIR: A Structured and Retargetable Approach to Tensor Compiler Construction. https:\/\/doi.org\/10.48550\/arXiv.2202.03293 arxiv:2202.03293. 10.48550\/arXiv.2202.03293","DOI":"10.48550\/arXiv.2202.03293"},{"key":"e_1_3_2_1_74_1","unstructured":"Jonatan Waern Per Ekemark Konstantinos Koukos Stefanos Kaxiras and Alexandra Jimborean. 2016. Profiling-Assisted Decoupled Access-Execute. arxiv:1601.01722 arXiv:1601.01722 [cs]"},{"key":"e_1_3_2_1_75_1","unstructured":"Andrew Waterman and Krste Asanovi\u0107. 2019. The RISC-V Instruction Set Manual Volume 1: User-Level ISA (document version 20191213 ed.). Available at https:\/\/riscv.org\/technical\/specifications"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/1772954.1772979"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/SCCC.2012.29"},{"key":"e_1_3_2_1_78_1","unstructured":"Florian Zaruba. 2023. Harnessing the RISC-V Wave: The Future is Now. https:\/\/www.axelera.ai\/harnessing-the-risc-v-wave-the-future-is-now\/"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2020.3045564"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2020.3027900"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1145\/3276491"}],"event":{"name":"CGO '25: 23rd ACM\/IEEE International Symposium on Code Generation and Optimization","location":"Las Vegas NV USA","acronym":"CGO '25","sponsor":["SIGPLAN SIGPLAN Programming Languages","SIGMICRO SIGMICRO Microarchitecture","IEEE Computer Society IEEE Computer Society"]},"container-title":["Proceedings of the 23rd ACM\/IEEE International Symposium on Code Generation and Optimization"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696443.3708952","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:13Z","timestamp":1750295413000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696443.3708952"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3]]},"references-count":81,"alternative-id":["10.1145\/3696443.3708952","10.1145\/3696443"],"URL":"https:\/\/doi.org\/10.1145\/3696443.3708952","relation":{},"subject":[],"published":{"date-parts":[[2025,3]]},"assertion":[{"value":"2025-03-01","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}