{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:10:12Z","timestamp":1765465812660,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","funder":[{"name":"Nation Research Foundation","award":["NRF-CRP23-2019-0003"],"award-info":[{"award-number":["NRF-CRP23-2019-0003"]}]},{"name":"Ministry of Education, Singapore","award":["MOE-MOET32024-0003"],"award-info":[{"award-number":["MOE-MOET32024-0003"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,22]]},"DOI":"10.1145\/3760250.3762226","type":"proceedings-article","created":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:06:36Z","timestamp":1765465596000},"page":"1-19","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["A Data-Driven Dynamic Execution Orchestration Architecture"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1143-0762","authenticated-orcid":false,"given":"Zhenyu","family":"Bai","sequence":"first","affiliation":[{"name":"School of Computing, National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1339-6048","authenticated-orcid":false,"given":"Pranav","family":"Dangi","sequence":"additional","affiliation":[{"name":"School of Computing, National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6015-1084","authenticated-orcid":false,"given":"Rohan","family":"Juneja","sequence":"additional","affiliation":[{"name":"School of Computing, National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7513-9494","authenticated-orcid":false,"given":"Zhaoying","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computing, National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7993-7127","authenticated-orcid":false,"given":"Zhanglu","family":"Yan","sequence":"additional","affiliation":[{"name":"School of Computing, National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3120-5773","authenticated-orcid":false,"given":"Huiying","family":"Lan","sequence":"additional","affiliation":[{"name":"Lumai Ltd., Oxford, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4136-4188","authenticated-orcid":false,"given":"Tulika","family":"Mitra","sequence":"additional","affiliation":[{"name":"School of Computing, National University of Singapore, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2025,12,11]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00023"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/12.48862"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3649329.3658488"},{"key":"e_1_3_2_1_4_1","volume-title":"Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150","author":"Beltagy Iz","year":"2020","unstructured":"Iz Beltagy, Matthew E Peters, and Arman Cohan. 2020. Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150 (2020)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/VLSITechnologyandCir46769.2022.9830509"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2016.2616357"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASAP.2017.7995277"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00053"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358276"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3656019.3689905"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2000064.2000108"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00084"},{"key":"e_1_3_2_1_14_1","volume-title":"2022 55th IEEE\/ACM International Symposium on Microarchitecture (MICRO). IEEE, 546--564","author":"Gobieski Graham","year":"2022","unstructured":"Graham Gobieski, Souradip Ghosh, Marijn Heule, Todd Mowry, Tony Nowatzki, Nathan Beckmann, and Brandon Lucia. 2022. Riptide: A programmable, energy-minimal dataflow compiler and architecture. In 2022 55th IEEE\/ACM International Symposium on Microarchitecture (MICRO). IEEE, 546--564."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/355791.355796"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2014.6757323"},{"key":"e_1_3_2_1_17_1","volume-title":"Onchip networks","author":"Jerger Natalie Enright","unstructured":"Natalie Enright Jerger, Tushar Krishna, and Li-Shiuan Peh. 2017. Onchip networks. Morgan & Claypool Publishers."},{"key":"e_1_3_2_1_18_1","volume-title":"Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al.","author":"Jiang Albert Q","year":"2023","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. 2023. Mistral 7B. arXiv preprint arXiv:2310.06825 (2023)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Norman P. Jouppi George Kurian Sheng Li Peter Ma Rahul Nagarajan Lifeng Nai Nishant Patil Suvinay Subramanian Andy Swing Brian Towles Cliff Young Xiang Zhou Zongwei Zhou and David Patterson. 2023. TPU v4: An Optically Reconfigurable Supercomputer for Machine Learning with Hardware Support for Embeddings. arXiv:2304.01433 [cs.AR] https:\/\/arxiv.org\/abs\/2304.01433","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_2_1_20_1","volume-title":"Tulika Mitra, and Li shiuan Peh.","author":"Juneja Rohan","year":"2025","unstructured":"Rohan Juneja, Pranav Dangi, Thilini Kaushalya Bandara, Tulika Mitra, and Li shiuan Peh. 2025. Nexus Machine: An Active Message Inspired Reconfigurable Architecture for Irregular Workloads. arXiv:2502.12380 [cs.AR] https:\/\/arxiv.org\/abs\/2502.12380"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480051"},{"key":"e_1_3_2_1_22_1","volume-title":"Technology Conference on Performance Evaluation and Benchmarking. Springer, 24--41","author":"Karimov Jeyhun","year":"2018","unstructured":"Jeyhun Karimov, Tilmann Rabl, and Volker Markl. 2018. Polybench: The first benchmark for polystores. In Technology Conference on Performance Evaluation and Benchmarking. Springer, 24--41."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the 54th Annual Design Automation Conference","author":"Karunaratne Manupa","year":"2017","unstructured":"Manupa Karunaratne, Aditi Kulkarni Mohite, Tulika Mitra, and Li-Shiuan Peh. 2017. HyCUBE: A CGRA with reconfigurable singlecycle multi-hop interconnect. In Proceedings of the 54th Annual Design Automation Conference 2017. 1--6."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00040"},{"key":"e_1_3_2_1_25_1","volume-title":"Rohan Juneja, Cheng Tan, Zhenyu Bai, and Tulika Mitra.","author":"Li Zhaoying","year":"2024","unstructured":"Zhaoying Li, Chenyang Yin, Thilini Kaushalya Bandara, Rohan Juneja, Cheng Tan, Zhenyu Bai, and Tulika Mitra. 2024. Enhancing CGRA Efficiency Through Aligned Compute and Communication Provisioning. arXiv preprint arXiv:2412.08137 (2024)."},{"key":"e_1_3_2_1_26_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Training-free activation sparsity in large language models. arXiv preprint arXiv:2408.14690","author":"Liu James","year":"2024","unstructured":"James Liu, Pragaash Ponnusamy, Tianle Cai, Han Guo, Yoon Kim, and Ben Athiwaratkun. 2024. Training-free activation sparsity in large language models. arXiv preprint arXiv:2408.14690 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3466"},{"key":"e_1_3_2_1_30_1","volume-title":"Erwin Laure, Ivy Bo Peng, and Jeffrey S Vetter.","author":"Markidis Stefano","year":"2018","unstructured":"Stefano Markidis, Steven Wei Der Chien, Erwin Laure, Ivy Bo Peng, and Jeffrey S Vetter. 2018. Nvidia tensor core programmability, performance & precision. In 2018 IEEE international parallel and distributed processing symposium workshops (IPDPSW). IEEE, 522--531."},{"key":"e_1_3_2_1_31_1","first-page":"124","article-title":"An In-depth Study on the Performance Impact of CUDA, OpenCL, and PTX Code","volume":"10","author":"Memarzia Puya","year":"2015","unstructured":"Puya Memarzia and Farshad Khunjush. 2015. An In-depth Study on the Performance Impact of CUDA, OpenCL, and PTX Code. J. Inf. Comput. Sci 10, 2 (2015), 124--136.","journal-title":"J. Inf. Comput. Sci"},{"key":"e_1_3_2_1_32_1","volume-title":"Oncel Tuzel, Golnoosh Samei, Mohammad Rastegari, and Mehrdad Farajtabar.","author":"Mirzadeh Iman","year":"2023","unstructured":"Iman Mirzadeh, Keivan Alizadeh, Sachin Mehta, Carlo C Del Mundo, Oncel Tuzel, Golnoosh Samei, Mohammad Rastegari, and Mehrdad Farajtabar. 2023. Relu strikes back: Exploiting activation sparsity in large language models. arXiv preprint arXiv:2310.04564 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Jeff Pool, Darko Stosic, Dusan Stosic, Ganesh Venkatesh, Chong Yu, and Paulius Micikevicius.","author":"Mishra Asit","year":"2021","unstructured":"Asit Mishra, Jorge Albericio Latorre, Jeff Pool, Darko Stosic, Dusan Stosic, Ganesh Venkatesh, Chong Yu, and Paulius Micikevicius. 2021. Accelerating Sparse Deep Neural Networks. arXiv:2104.08378 [cs.LG] https:\/\/arxiv.org\/abs\/2104.08378"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582069"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISOCC62682.2024.10762131"},{"key":"e_1_3_2_1_36_1","volume-title":"2024 IEEE Hot Chips 36 Symposium (HCS). IEEE, 1--1.","author":"Nambiar Vishnu P","year":"2024","unstructured":"Vishnu P Nambiar, Yi Sheng Chong, Thilini Kaushalya Bandara, Dhananjaya Wijerathne, Zhaoying Li, Rohan Juneja, Li-Shiuan Peh, Tulika Mitra, and Anh Tuan Do. 2024. PACE: A scalable and energy efficient CGRA in a RISC-V SoC for edge computing applications. In 2024 IEEE Hot Chips 36 Symposium (HCS). IEEE, 1--1."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480048"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080255"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.60"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/hpca56546.2023.10071089"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304025"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3140659.3080256"},{"key":"e_1_3_2_1_43_1","volume-title":"RETROSPECTIVE: Plasticine: A Reconfigurable Architecture For Parallel Paterns.","author":"Prabhakar Raghu","year":"2023","unstructured":"Raghu Prabhakar, Yaqi Zhang, David Koeplinger, Matt Feldman, Tian Zhao, Stefan Hadjis, Ardavan Pedram, Christos Kozyrakis, and Kunle Olukotun. 2023. RETROSPECTIVE: Plasticine: A Reconfigurable Architecture For Parallel Paterns. (2023). https:\/\/bit.ly\/isca50_retrospective"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480047"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/871656.859667"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533040"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614283"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446703"},{"key":"e_1_3_2_1_50_1","volume-title":"Fifth Workshop on Open-Source EDA Technology (WOSET).","author":"Wijerathne Dhananjaya","year":"2022","unstructured":"Dhananjaya Wijerathne, Zhaoying Li, Manupa Karunaratne, Li-Shiuan Peh, and Tulika Mitra. 2022. Morpher: An open-source integrated compilation and simulation framework for cgra. In Fifth Workshop on Open-Source EDA Technology (WOSET)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071027"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Jingyang Yuan Huazuo Gao Damai Dai Junyu Luo Liang Zhao Zhengyan Zhang Zhenda Xie YX Wei Lean Wang Zhiping Xiao et al. 2025. Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention. arXiv preprint arXiv:2502.11089 (2025).","DOI":"10.18653\/v1\/2025.acl-long.1126"},{"key":"e_1_3_2_1_53_1","volume-title":"Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, et al.","author":"Zaheer Manzil","year":"2020","unstructured":"Manzil Zaheer, Guru Guruganesh, Kumar Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, et al. 2020. Big bird: Transformers for longer sequences. Advances in neural information processing systems 33 (2020), 17283--17297."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446702"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322249"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322249"},{"key":"e_1_3_2_1_57_1","volume-title":"Learning n: m finegrained structured sparse neural networks from scratch. arXiv preprint arXiv:2102.04010","author":"Zhou Aojun","year":"2021","unstructured":"Aojun Zhou, Yukun Ma, Junnan Zhu, Jianbo Liu, Zhijie Zhang, Kun Yuan, Wenxiu Sun, and Hongsheng Li. 2021. Learning n: m finegrained structured sparse neural networks from scratch. arXiv preprint arXiv:2102.04010 (2021)."}],"event":{"name":"ASPLOS '26:31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Pittsburgh PA USA","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3760250.3762226","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:07:32Z","timestamp":1765465652000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3760250.3762226"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,11]]},"references-count":57,"alternative-id":["10.1145\/3760250.3762226","10.1145\/3760250"],"URL":"https:\/\/doi.org\/10.1145\/3760250.3762226","relation":{},"subject":[],"published":{"date-parts":[[2025,12,11]]},"assertion":[{"value":"2025-12-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}