{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T23:20:07Z","timestamp":1780356007653,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":108,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756045","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:21:19Z","timestamp":1760721679000},"page":"1363-1380","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["PyTorchSim: A Comprehensive, Fast, and Accurate NPU Simulation Framework"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-1918-8445","authenticated-orcid":false,"given":"Wonhyuk","family":"Yang","sequence":"first","affiliation":[{"name":"POSTECH, Pohang, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6738-3288","authenticated-orcid":false,"given":"Yunseon","family":"Shin","sequence":"additional","affiliation":[{"name":"POSTECH, Pohang, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6821-1594","authenticated-orcid":false,"given":"Okkyun","family":"Woo","sequence":"additional","affiliation":[{"name":"POSTECH, Pohang, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0248-3439","authenticated-orcid":false,"given":"Geonwoo","family":"Park","sequence":"additional","affiliation":[{"name":"Samsung Electronics, Suwon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4989-2834","authenticated-orcid":false,"given":"Hyungkyu","family":"Ham","sequence":"additional","affiliation":[{"name":"POSTECH, Pohang, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2115-0871","authenticated-orcid":false,"given":"Jeehoon","family":"Kang","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea and FuriosaAI, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6629-449X","authenticated-orcid":false,"given":"Jongse","family":"Park","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5749-5794","authenticated-orcid":false,"given":"Gwangsun","family":"Kim","sequence":"additional","affiliation":[{"name":"POSTECH, Pohang, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2019. Spike RISC-V ISA Simulator. GitHub repository. https:\/\/github.com\/riscv-software-src\/riscv-isa-sim."},{"key":"e_1_3_3_2_3_2","unstructured":"2021. RISC-V \"V\" Vector Extension Version 1.0. RISC-V International. https:\/\/github.com\/riscvarchive\/riscv-v-spec\/releases\/download\/v1.0\/riscv-v-spec-1.0.pdf"},{"key":"e_1_3_3_2_4_2","unstructured":"2022. NVIDIA H100 Tensor Core GPU Architecture. https:\/\/resources.nvidia.com\/en-us-data-center-overview\/gtc22-whitepaper-hopper."},{"key":"e_1_3_3_2_5_2","unstructured":"2023. OpenXLA Project. Google. https:\/\/openxla.org\/xla"},{"key":"e_1_3_3_2_6_2","unstructured":"2023. SST-STONNE simulator. [Online]. Available: https:\/\/github.com\/stonne-simulator\/sst-elements-with-stonne."},{"key":"e_1_3_3_2_7_2","unstructured":"2024. How the PolyBlocks AI Compiler Works. PolyMage Labs. [Online]. Available: https:\/\/docs.polymagelabs.com\/articles\/polyblocks-quantization.html."},{"key":"e_1_3_3_2_8_2","unstructured":"2024. Inferentia Architecture. AWS Neuron Documentation. https:\/\/awsdocs-neuron.readthedocs-hosted.com\/en\/latest\/general\/arch\/neuron-hardware\/inferentia.html"},{"key":"e_1_3_3_2_9_2","unstructured":"2024. Introduction to ONNX. ONNX 0.19.0 documentation. [Online]. Available: https:\/\/onnx.ai\/onnx\/intro\/."},{"key":"e_1_3_3_2_10_2","unstructured":"2024. Meeting the challenge of concurrent RTL and workload verification and validation. SIEMENS whitepaper."},{"key":"e_1_3_3_2_11_2","unstructured":"2024. Optimize TensorFlow performance using the Profiler. Google. [Online]. Available: https:\/\/www.tensorflow.org\/guide\/profiler."},{"key":"e_1_3_3_2_12_2","unstructured":"2024. PyTorch Documentation (2.4) - CUDA semantics (Memory Management). https:\/\/pytorch.org\/docs\/stable\/notes\/cuda.html#cuda-memory-management"},{"key":"e_1_3_3_2_13_2","unstructured":"2024. PyTorch Documentation (2.4) - torch.compile. https:\/\/pytorch.org\/docs\/stable\/generated\/torch.compile.html"},{"key":"e_1_3_3_2_14_2","unstructured":"2024. RISC-V Compiler Infrastructure and Toolchain. https:\/\/github.com\/riscv-software-src. [Accessed 21-06-2025]."},{"key":"e_1_3_3_2_15_2","unstructured":"2024. SiFive Vector Coprocessor Interface (VCIX) Software Specification Version 1.1. [Online]. Available: https:\/\/www.sifive.com\/document-file\/sifive-vector-coprocessor-interface-vcix-software."},{"key":"e_1_3_3_2_16_2","unstructured":"2024. TensorBoard: TensorFlow\u2019s visualization toolkit. Google. [Online]. Available: https:\/\/www.tensorflow.org\/tensorboard."},{"key":"e_1_3_3_2_17_2","unstructured":"2024. Trainium Architecture. AWS Neuron Documentation. https:\/\/awsdocs-neuron.readthedocs-hosted.com\/en\/latest\/general\/arch\/neuron-hardware\/trainium.html"},{"key":"e_1_3_3_2_18_2","unstructured":"2025. Apple Watch Series 10. [Online]. Available: https:\/\/www.apple.com\/apple-watch-series-10\/specs\/."},{"key":"e_1_3_3_2_19_2","volume-title":"CUDA C++ Programming Guide","year":"2025","unstructured":"2025. CUDA C++ Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/ Section on PTX and SASS."},{"key":"e_1_3_3_2_20_2","unstructured":"2025. Efficient GEMM in CUDA. NVIDIA CUTLASS Documentation. https:\/\/docs.nvidia.com\/cutlass\/media\/docs\/cpp\/efficient_gemm.html. [Accessed 20-06-2025]."},{"key":"e_1_3_3_2_21_2","unstructured":"2025. Gemmini. GitHub repository. https:\/\/github.com\/ucb-bar\/gemmini."},{"key":"e_1_3_3_2_22_2","unstructured":"2025. Papers With Code : Trends. Meta AI. https:\/\/paperswithcode.com\/trends. [Accessed 16 June 2025]."},{"key":"e_1_3_3_2_23_2","unstructured":"2025. PyTorch Documentation \u2013 torch.sparse.mm. The Linux Foundation. [Online]. Available: https:\/\/pytorch.org\/docs\/stable\/generated\/torch.sparse.mm.html."},{"key":"e_1_3_3_2_24_2","unstructured":"2025. PyTorch2 AsyncCompile. [Online]. Available: https:\/\/github.com\/pytorch\/pytorch\/blob\/main\/torch\/_inductor\/async_compile.py."},{"key":"e_1_3_3_2_25_2","unstructured":"2025. \"RDNA4\" Instruction Set Architecture Reference Guide. AMD. [Online]. Available: https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/radeon-tech-docs\/instruction-set-architectures\/rdna4-instruction-set-architecture.pdf."},{"key":"e_1_3_3_2_26_2","first-page":"265","volume-title":"12th USENIX symposium on operating systems design and implementation (OSDI 16)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, et\u00a0al. 2016. TensorFlow: a system for Large-Scale machine learning. In 12th USENIX symposium on operating systems design and implementation (OSDI 16). 265\u2013283."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527405"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00023"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-53282-5"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2013.6557148"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD.2016.7753351"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640366"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"crossref","unstructured":"Todd Austin Eric Larson and Dan Ernst. 2002. SimpleScalar: An infrastructure for computer system modeling. Computer 35 2 (2002) 59\u201367.","DOI":"10.1109\/2.982917"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00021"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3559009.3569666"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"crossref","unstructured":"Nathan Binkert Bradford Beckmann Gabriel Black Steven\u00a0K Reinhardt Ali Saidi Arkaprava Basu Joel Hestness Derek\u00a0R Hower Tushar Krishna Somayeh Sardashti et\u00a0al. 2011. The gem5 simulator. ACM SIGARCH computer architecture news 39 2 (2011) 1\u20137.","DOI":"10.1145\/2024716.2024718"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","unstructured":"Nathan Binkert Bradford Beckmann Gabriel Black Steven\u00a0K. Reinhardt Ali Saidi Arkaprava Basu Joel Hestness Derek\u00a0R. Hower Tushar Krishna Somayeh Sardashti Rathijit Sen Korey Sewell Muhammad Shoaib Nilay Vaish Mark\u00a0D. Hill and David\u00a0A. Wood. 2011. The gem5 simulator. SIGARCH Comput. Archit. News 39 2 (aug 2011) 1\u20137. 10.1145\/2024716.2024718","DOI":"10.1145\/2024716.2024718"},{"key":"e_1_3_3_2_39_2","volume-title":"JAX: composable transformations of Python+NumPy programs","author":"Bradbury James","year":"2018","unstructured":"James Bradbury, Roy Frostig, Peter Hawkins, Matthew\u00a0James Johnson, Chris Leary, Dougal Maclaurin, George Necula, Adam Paszke, Jake VanderPlas, Skye Wanderman-Milne, and Qiao Zhang. 2018. JAX: composable transformations of Python+NumPy programs. http:\/\/github.com\/jax-ml\/jax"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00022"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","unstructured":"Trevor\u00a0E. Carlson Wim Heirman Stijn Eyerman Ibrahim Hur and Lieven Eeckhout. 2014. An Evaluation of High-Level Mechanistic Core Models. ACM Trans. Archit. Code Optim. 11 3 Article 28 (Aug. 2014) 25\u00a0pages. 10.1145\/2629677","DOI":"10.1145\/2629677"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","unstructured":"Trevor\u00a0E. Carlson Wim Heirman Stijn Eyerman Ibrahim Hur and Lieven Eeckhout. 2014. An Evaluation of High-Level Mechanistic Core Models. ACM Trans. Archit. Code Optim. 11 3 Article 28 (Aug. 2014) 25\u00a0pages. 10.1145\/2629677","DOI":"10.1145\/2629677"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/CODES-ISSS.2013.6659023"},{"key":"e_1_3_3_2_44_2","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). USENIX Association, Carlsbad, CA, 578\u2013594. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/chen"},{"key":"e_1_3_3_2_45_2","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cuDNN: Efficient Primitives for Deep Learning. CoRR abs\/1410.0759 (2014). arXiv:https:\/\/arXiv.org\/abs\/1410.0759http:\/\/arxiv.org\/abs\/1410.0759"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC63097.2024.00012"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","unstructured":"L. Dagum and R. Menon. 1998. OpenMP: an industry standard API for shared-memory programming. IEEE Computational Science and Engineering 5 1 (1998) 46\u201355. 10.1109\/99.660313","DOI":"10.1109\/99.660313"},{"key":"e_1_3_3_2_48_2","unstructured":"Tri Dao Dan Fu Stefano Ermon Atri Rudra and Christopher R\u00e9. 2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. Advances in neural information processing systems 35 (2022) 16344\u201316359."},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"crossref","unstructured":"Li Deng. 2012. The mnist database of handwritten digit images for machine learning research. IEEE Signal Processing Magazine 29 6 (2012) 141\u2013142.","DOI":"10.1109\/MSP.2012.2211477"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"e_1_3_3_2_51_2","unstructured":"Dave Ditzel. 2024. Why Esperanto picked RISC-V for HPC Computing. International workshop on RISC-V for HPC (RISCV-HPC) at SC24. [Online]. Available: https:\/\/github.com\/RISCVtestbed\/riscvtestbed.github.io\/blob\/main\/assets\/files\/sc24\/Ditzel.pdf."},{"key":"e_1_3_3_2_52_2","unstructured":"Roger Espasa. 2024. SemiDynamics: Out-of-Order RISC-V Cores Now with Vector!https:\/\/riscv-europe.org\/summit\/2024\/media\/proceedings\/plenary\/Tue-09-40-Roger-Espasa.pdf. Plenary Talk RISC-V Summit Europe 2024."},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589348"},{"key":"e_1_3_3_2_54_2","unstructured":"gem5 Project. 2021. gem5 21.2 Release. https:\/\/www.gem5.org\/project\/2021\/12\/28\/gem5-21-2.html. [Accessed 21-06-2025]."},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586216"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640365"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC59245.2023.00018"},{"key":"e_1_3_3_2_59_2","unstructured":"Intel Corporation. 2020. Intel\u00ae Advanced Vector Extensions 512 (Intel\u00ae AVX-512). https:\/\/www.intel.com\/content\/www\/us\/en\/architecture-and-technology\/avx-512-overview.html. [Accessed 21-06-2025]."},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2013.6557149"},{"key":"e_1_3_3_2_61_2","series-title":"Proceedings of Machine Learning Research","first-page":"4911","volume-title":"Proceedings of the 37th International Conference on Machine Learning","volume":"119","author":"Johnson Tyler","year":"2020","unstructured":"Tyler Johnson, Pulkit Agrawal, Haijie Gu, and Carlos Guestrin. 2020. AdaScale SGD: A User-Friendly Algorithm for Distributed Training. In Proceedings of the 37th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0119), Hal\u00a0Daum\u00e9 III and Aarti Singh (Eds.). PMLR, 4911\u20134920. https:\/\/proceedings.mlr.press\/v119\/johnson20a.html"},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00010"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454038"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"e_1_3_3_2_67_2","unstructured":"Jehandad Khan Paul Fultz Artem Tamazov Daniel Lowell Chao Liu Michael Melesse Murali Nandhimandalam Kamil Nasyrov Ilya Perminov Tejash Shah Vasilii Filippov Jing Zhang Jing Zhou Bragadeesh Natarajan and Mayank Daga. 2019. MIOpen: An Open Source Library For Deep Learning Primitives. CoRR abs\/1910.00078 (2019). arXiv:https:\/\/arXiv.org\/abs\/1910.00078http:\/\/arxiv.org\/abs\/1910.00078"},{"key":"e_1_3_3_2_68_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00069"},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"publisher","DOI":"10.1145\/3623278.3624753"},{"key":"e_1_3_3_2_70_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358252"},{"key":"e_1_3_3_2_71_2","unstructured":"William Lacy Gregory\u00a0Michael Thorson Christopher\u00a0Aaron Clark Norman\u00a0Paul Jouppi Thomas Norrie and Andrew\u00a0Everett Phelps. 2022. Vector Processing Unit. U.S Patent 11520581."},{"key":"e_1_3_3_2_72_2","unstructured":"Zhenzhong Lan Mingda Chen Sebastian Goodman Kevin Gimpel Piyush Sharma and Radu Soricut. 2020. ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. arxiv:https:\/\/arXiv.org\/abs\/1909.11942\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1909.11942"},{"key":"e_1_3_3_2_73_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2004.1281665"},{"key":"e_1_3_3_2_74_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370308"},{"key":"e_1_3_3_2_75_2","doi-asserted-by":"publisher","unstructured":"Sean Lie. 2023. Cerebras Architecture Deep Dive: First Look Inside the Hardware\/Software Co-Design for Deep Learning. IEEE Micro 43 3 (2023) 18\u201330. 10.1109\/MM.2023.3256384","DOI":"10.1109\/MM.2023.3256384"},{"key":"e_1_3_3_2_76_2","unstructured":"Jason Lowe-Power. 2023. gem5 Version 23.1 Release. gem5 blog. https:\/\/www.gem5.org\/project\/2023\/12\/21\/gem5-23-1.html"},{"key":"e_1_3_3_2_77_2","unstructured":"Jason Lowe-Power Abdul\u00a0Mutaal Ahmad Ayaz Akram Mohammad Alian Rico Amslinger Matteo Andreozzi Adri\u00e0 Armejach Nils Asmussen Srikant Bharadwaj Gabe Black Gedare Bloom Bobby\u00a0R. Bruce Daniel\u00a0Rodrigues Carvalho Jer\u00f3nimo Castrill\u00f3n Lizhong Chen Nicolas Derumigny Stephan Diestelhorst Wendy Elsasser Marjan Fariborz Amin\u00a0Farmahini Farahani Pouya Fotouhi Ryan Gambord Jayneel Gandhi Dibakar Gope Thomas Grass Bagus Hanindhito Andreas Hansson Swapnil Haria Austin Harris Timothy Hayes Adrian Herrera Matthew Horsnell Syed Ali\u00a0Raza Jafri Radhika Jagtap Hanhwi Jang Reiley Jeyapaul Timothy\u00a0M. Jones Matthias Jung Subash Kannoth Hamidreza Khaleghzadeh Yuetsu Kodama Tushar Krishna Tommaso Marinelli Christian Menard Andrea Mondelli Tiago M\u00fcck Omar Naji Krishnendra Nathella Hoa Nguyen Nikos Nikoleris Lena\u00a0E. Olson Marc\u00a0S. Orr Binh Pham Pablo Prieto Trivikram Reddy Alec Roelke Mahyar Samani Andreas Sandberg Javier Setoain Boris Shingarov Matthew\u00a0D. Sinclair Tuan Ta Rahul Thakur Giacomo Travaglini Michael Upton Nilay Vaish Ilias Vougioukas Zhengrong Wang Norbert Wehn Christian Weis David\u00a0A. Wood Hongil Yoon and \u00c9der\u00a0F. Zulian. 2020. The gem5 Simulator: Version 20.0+. CoRR abs\/2007.03152 (2020). arXiv:https:\/\/arXiv.org\/abs\/2007.03152https:\/\/arxiv.org\/abs\/2007.03152"},{"key":"e_1_3_3_2_78_2","doi-asserted-by":"crossref","unstructured":"Chi-Keung Luk Robert Cohn Robert Muth Harish Patil Artur Klauser Geoff Lowney Steven Wallace Vijay\u00a0Janapa Reddi and Kim Hazelwood. 2005. Pin: building customized program analysis tools with dynamic instrumentation. Acm sigplan notices 40 6 (2005) 190\u2013200.","DOI":"10.1145\/1064978.1065034"},{"key":"e_1_3_3_2_79_2","doi-asserted-by":"publisher","unstructured":"Haocong Luo Yahya\u00a0Can Tu\u011frul F.\u00a0Nisa Bostanc\u0131 Ataberk Olgun A.\u00a0Giray Ya\u011fl\u0131k\u00e7\u0131 and Onur Mutlu. 2023. Ramulator 2.0: A Modern Modular and Extensible DRAM Simulator. IEEE Comput. Archit. Lett. 23 1 (Nov. 2023) 112\u2013116. 10.1109\/LCA.2023.3333759","DOI":"10.1109\/LCA.2023.3333759"},{"key":"e_1_3_3_2_80_2","doi-asserted-by":"publisher","unstructured":"Sheng Ma Yuanwu Lei Libo Huang and Zhiying Wang. 2019. MT-DMA: A DMA Controller Supporting Efficient Matrix Transposition for Digital Signal Processing. IEEE Access 7 (2019) 5808\u20135818. 10.1109\/ACCESS.2018.2889558","DOI":"10.1109\/ACCESS.2018.2889558"},{"key":"e_1_3_3_2_81_2","doi-asserted-by":"crossref","unstructured":"Milo\u00a0MK Martin Daniel\u00a0J Sorin Bradford\u00a0M Beckmann Michael\u00a0R Marty Min Xu Alaa\u00a0R Alameldeen Kevin\u00a0E Moore Mark\u00a0D Hill and David\u00a0A Wood. 2005. Multifacet\u2019s general execution-driven multiprocessor simulator (GEMS) toolset. ACM SIGARCH Computer Architecture News 33 4 (2005) 92\u201399.","DOI":"10.1145\/1105734.1105747"},{"key":"e_1_3_3_2_82_2","unstructured":"MLCommons. 2025. Reference implementations of MLPerf\u2122 Inference benchmarks (v5.1). https:\/\/github.com\/mlcommons\/inference. [Accessed 16 June 2025]."},{"key":"e_1_3_3_2_83_2","unstructured":"MLCommons. 2025. Reference implementations of MLPerf\u2122 Training benchmarks (v5.0). https:\/\/github.com\/mlcommons\/training. [Accessed 16 June 2025]."},{"key":"e_1_3_3_2_84_2","unstructured":"Rolf Morel. 2025. MLIR Tensor Compiler: Design Group and Charter Update. https:\/\/llvm.org\/devmtg\/2025-04\/slides\/quick_talk\/morel_mlir_tensor.pdf. LLVM Developers\u2019 Meeting April 2025. Quick Talk.."},{"key":"e_1_3_3_2_85_2","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582069"},{"key":"e_1_3_3_2_86_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC53511.2021.00028"},{"key":"e_1_3_3_2_87_2","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"e_1_3_3_2_88_2","doi-asserted-by":"publisher","unstructured":"Thomas Norrie Nishant Patil Doe\u00a0Hyun Yoon George Kurian Sheng Li James Laudon Cliff Young Norman Jouppi and David Patterson. 2021. The Design Process for Google\u2019s Training Chips: TPUv2 and TPUv3. IEEE Micro 41 2 (2021) 56\u201363. 10.1109\/MM.2021.3058217","DOI":"10.1109\/MM.2021.3058217"},{"key":"e_1_3_3_2_89_2","unstructured":"Ryan O\u2019Connor. [n. d.]. PyTorch vs TensorFlow in 2023. AssemblyAI Blog. https:\/\/www.assemblyai.com\/blog\/pytorch-vs-tensorflow-in-2023\/"},{"key":"e_1_3_3_2_90_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00042"},{"key":"e_1_3_3_2_91_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00100"},{"key":"e_1_3_3_2_92_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS64960.2025.00026"},{"key":"e_1_3_3_2_93_2","volume-title":"Proceedings of the Fifth Conference on Machine Learning and Systems, MLSys 2022, Santa Clara, CA, USA, August 29 - September 1, 2022","author":"Reed James\u00a0K.","year":"2022","unstructured":"James\u00a0K. Reed, Zachary DeVito, Horace He, Ansley Ussery, and Jason Ansel. 2022. torch.fx: Practical Program Capture and Transformation for Deep Learning in Python. In Proceedings of the Fifth Conference on Machine Learning and Systems, MLSys 2022, Santa Clara, CA, USA, August 29 - September 1, 2022."},{"key":"e_1_3_3_2_94_2","unstructured":"RISC-V International. 2024. RISC-V Ecosystem Landscape. https:\/\/landscape.riscv.org\/. [Accessed 21-06-2025]."},{"key":"e_1_3_3_2_95_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS48437.2020.00016"},{"key":"e_1_3_3_2_96_2","doi-asserted-by":"publisher","unstructured":"Wonik Seo Sanghoon Cha Yeonjae Kim Jaehyuk Huh and Jongse Park. 2021. SLO-Aware Inference Scheduler for Heterogeneous Processors in Edge Platforms. ACM Trans. Archit. Code Optim. 18 4 Article 43 (July 2021) 26\u00a0pages. 10.1145\/3460352","DOI":"10.1145\/3460352"},{"key":"e_1_3_3_2_97_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358302"},{"key":"e_1_3_3_2_98_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322230"},{"key":"e_1_3_3_2_99_2","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_3_2_100_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480128"},{"key":"e_1_3_3_2_101_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS61935.2024.10664810"},{"key":"e_1_3_3_2_102_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00077"},{"key":"e_1_3_3_2_103_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358307"},{"key":"e_1_3_3_2_104_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00096"},{"key":"e_1_3_3_2_105_2","doi-asserted-by":"publisher","unstructured":"Sam\u00a0(Likun) Xi Yuan Yao Kshitij Bhardwaj Paul Whatmough Gu-Yeon Wei and David Brooks. 2020. SMAUG: End-to-End Full-Stack Simulation Infrastructure for Deep Learning Workloads. ACM Trans. Archit. Code Optim. 17 4 Article 39 (nov 2020) 26\u00a0pages. 10.1145\/3424669","DOI":"10.1145\/3424669"},{"key":"e_1_3_3_2_106_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589059"},{"key":"e_1_3_3_2_107_2","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731412"},{"key":"e_1_3_3_2_108_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC53511.2021.00029"},{"key":"e_1_3_3_2_109_2","first-page":"989","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhuang Donglin","year":"2024","unstructured":"Donglin Zhuang, Zhen Zheng, Haojun Xia, Xiafei Qiu, Junjie Bai, Wei Lin, and Shuaiwen\u00a0Leon Song. 2024. MonoNN: Enabling a New Monolithic Optimization Space for Neural Network Inference Tasks on Modern GPU-Centric Architectures. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 989\u20131005. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/zhuang"}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756045","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:49:36Z","timestamp":1769464176000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756045"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":108,"alternative-id":["10.1145\/3725843.3756045","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756045","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}