{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T08:26:18Z","timestamp":1769502378972,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","funder":[{"name":"National Research Foundation, Singapore","award":["NRF-CRP23-2019-0003"],"award-info":[{"award-number":["NRF-CRP23-2019-0003"]}]},{"name":"Ministry of Education, Singapore","award":["MOE-MOET32024-0003"],"award-info":[{"award-number":["MOE-MOET32024-0003"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756091","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:19:56Z","timestamp":1760721596000},"page":"1221-1235","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Nexus Machine: An Energy-Efficient Active Message Inspired Reconfigurable Architecture"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6015-1084","authenticated-orcid":false,"given":"Rohan","family":"Juneja","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1339-6048","authenticated-orcid":false,"given":"Pranav","family":"Dangi","sequence":"additional","affiliation":[{"name":"National University Of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3413-136X","authenticated-orcid":false,"given":"Thilini Kaushalya","family":"Bandara","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4136-4188","authenticated-orcid":false,"given":"Tulika","family":"Mitra","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9010-6519","authenticated-orcid":false,"given":"Li-Shiuan","family":"Peh","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Zhenyu Bai Pranav Dangi Rohan Juneja Zhaoying Li Zhanglu Yan Huiying Lan and Tulika Mitra. [n. d.]. A Data-Driven Dynamic Execution Orchestration Architecture. https:\/\/www.comp.nus.edu.sg\/\u00a0tulika\/ASPLOS26.pdf Accessed: 2025-08-06."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD57390.2023.10323612"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","unstructured":"Gunnar Carlstedt and Mats Rimborg. 2023. Bubble NoC \u2013 A Low Energy Network-on-Chip with Small Footprint and High Performance. (2 2023). 10.36227\/techrxiv.17206295.v2","DOI":"10.36227\/techrxiv.17206295.v2"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/VLSITechnologyandCir46769.2022.9830509"},{"key":"e_1_3_3_1_7_2","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cuDNN: Efficient Primitives for Deep Learning. CoRR abs\/1410.0759 (2014). arXiv:https:\/\/arXiv.org\/abs\/1410.0759http:\/\/arxiv.org\/abs\/1410.0759"},{"key":"e_1_3_3_1_8_2","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cuDNN: Efficient Primitives for Deep Learning. arxiv:https:\/\/arXiv.org\/abs\/1410.0759\u00a0[cs.NE] https:\/\/arxiv.org\/abs\/1410.0759"},{"key":"e_1_3_3_1_9_2","unstructured":"NVIDIA Corporation. 2020. Structured Sparsity in the NVIDIA Ampere Architecture and Applications in Search Engines. https:\/\/developer.nvidia.com\/blog\/accelerating-inference-with-sparsity-using-ampere-and-tensorrt. NVIDIA Developer Blog."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358276"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","unstructured":"W.J. Dally J.A.S. Fiske J.S. Keen R.A. Lethin M.D. Noakes P.R. Nuth R.E. Davison and G.A. Fyler. 1992. The message-driven processor: a multicomputer processing node with efficient mechanisms. IEEE Micro 12 2 (1992) 23\u201339. 10.1109\/40.127581","DOI":"10.1109\/40.127581"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3656019.3689905"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.1992.753322"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","unstructured":"Yizhao Gao Baoheng Zhang Yuhao Ding and Hayden So. 2024. A Composable Dynamic Sparse Dataflow Architecture for Efficient Event-based Vision Processing on FPGA. 246\u2013257. 10.1145\/3626202.3637558","DOI":"10.1145\/3626202.3637558"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/139669.140384"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00084"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00046"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","unstructured":"S.C. Goldstein H. Schmit M. Budiu S. Cadambi M. Moe and R.R. Taylor. 2000. PipeRench: a reconfigurable architecture and compiler. Computer 33 4 (2000) 70\u201377. 10.1109\/2.839324","DOI":"10.1109\/2.839324"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358291"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.30"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358275"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","unstructured":"Olivia Hsu Alexander Rucker Tian Zhao Kunle Olukotun and Fredrik Kjolstad. 2022. Stardust: Compiling Sparse Tensor Algebra to a Reconfigurable Dataflow Architecture. CoRR abs\/2211.03251 (2022). 10.48550\/ARXIV.2211.03251 arXiv:https:\/\/arXiv.org\/abs\/2211.03251","DOI":"10.48550\/ARXIV.2211.03251"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582051"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_3_1_26_2","unstructured":"Rohan Juneja Pranav Dangi Thilini\u00a0Kaushalya Bandara Zhaoying Li Dhananjaya Wijerathne Li-Shiuan Peh and Tulika Mitra. 2025. Building an Open CGRA Ecosystem for Agile Innovation. arxiv:https:\/\/arXiv.org\/abs\/2508.19090\u00a0[cs.AR] https:\/\/arxiv.org\/abs\/2508.19090"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/NOCS50636.2020.9241586"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3061639.3062262"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","unstructured":"George Karypis and Vipin Kumar. 1998. A Fast and High Quality Multilevel Scheme for Partitioning Irregular Graphs. SIAM Journal on Scientific Computing 20 1 (1998) 359\u2013392. 10.1137\/S1064827595287997 arXiv:10.1137\/S1064827595287997","DOI":"10.1137\/S1064827595287997"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","unstructured":"Fredrik Kjolstad Shoaib Kamil Stephen Chou David Lugato and Saman Amarasinghe. 2017. The tensor algebra compiler. Proc. ACM Program. Lang. 1 OOPSLA Article 77 (oct 2017) 29\u00a0pages. 10.1145\/3133901","DOI":"10.1145\/3133901"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/VLSITechnologyandCir46783.2024.10631383"},{"key":"e_1_3_3_1_32_2","unstructured":"James Liu Pragaash Ponnusamy Tianle Cai Han Guo Yoon Kim and Ben Athiwaratkun. 2024. Training-Free Activation Sparsity in Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2408.14690\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2408.14690"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480125"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-45234-8_7"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582069"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISOCC62682.2024.10762131"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS61935.2024.10665106"},{"key":"e_1_3_3_1_38_2","volume-title":"On-Chip Networks, Second Edition","author":"Tushar\u00a0Krishna Li-Shiuan\u00a0Peh Natalie Enright\u00a0Jerger,","year":"2022","unstructured":"Li-Shiuan\u00a0Peh Natalie Enright\u00a0Jerger, Tushar\u00a0Krishna. 2022. On-Chip Networks, Second Edition. Springer Cham."},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3587135.3592196"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/215399.215455"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480048"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080255"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2014.6983052"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"crossref","unstructured":"Marcelo Orenes-Vera Esin Tureci David Wentzlaff and Margaret Martonosi. 2022. Dalorex: A Data-Local Program Execution and Architecture for Memory-bound Applications. 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA) (2022) 718\u2013730. https:\/\/api.semanticscholar.org\/CorpusID:251105150","DOI":"10.1109\/HPCA56546.2023.10071089"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00067"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","unstructured":"Angshuman Parashar Michael Pellauer Michael Adler Bushra Ahsan Neal Crago Daniel Lustig Vladimir Pavlov Antonia Zhai Mohit Gambhir Aamer Jaleel Randy Allmon Rachid Rayess Stephen Maresh and Joel Emer. 2014. Efficient Spatial Processing Element Control via Triggered Instructions. IEEE Micro 34 3 (2014) 120\u2013137. 10.1109\/MM.2014.14","DOI":"10.1109\/MM.2014.14"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080256"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v29i1.9277"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480047"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/859618.859667"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"crossref","unstructured":"Nathan Serafin Souradip Ghosh Harsh Desai Nathan Beckmann and Brandon Lucia. 2023. Pipestitch: An energy-minimal dataflow architecture with lightweight threads. IEEE Micro (2023).","DOI":"10.1145\/3613424.3614283"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00068"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2003.1253203"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00042"},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1145\/800076.802479"},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"e_1_3_3_1_57_2","volume-title":"Workshop on Open-Source EDA Technology (WOSET)","author":"Wijerathne Dhananjaya","year":"2022","unstructured":"Dhananjaya Wijerathne, Zhaoying Li, Manupa Karunaratne, Li-Shiuan Peh, and Tulika Mitra. 2022. Morpher: An Open-Source Integrated Compilation and Simulation Framework for CGRA. In Workshop on Open-Source EDA Technology (WOSET)."},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623793"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582047"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071027"},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"publisher","unstructured":"Baofen Yuan Jianfeng Zhu Xingchen Man Zijiao Ma Shouyi Yin Shaojun Wei and Leibo Liu. 2022. Dynamic-II Pipeline: Compiling Loops With Irregular Branches on Static-Scheduling CGRA. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 41 9 (2022) 2929\u20132942. 10.1109\/TCAD.2021.3121346","DOI":"10.1109\/TCAD.2021.3121346"},{"key":"e_1_3_3_1_62_2","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446702"},{"key":"e_1_3_3_1_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00030"},{"key":"e_1_3_3_1_64_2","unstructured":"Ningxin Zheng Huiqiang Jiang Quanlu Zhang Zhenhua Han Yuqing Yang Lingxiao Ma Fan Yang Chengruidong Zhang Lili Qiu Mao Yang and Lidong Zhou. 2023. PIT: Optimization of Dynamic Sparse Deep Learning Models via Permutation Invariant Transformation. arxiv:https:\/\/arXiv.org\/abs\/2301.10936\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2301.10936"}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756091","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:45:28Z","timestamp":1769463928000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756091"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":63,"alternative-id":["10.1145\/3725843.3756091","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756091","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}