{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T11:40:08Z","timestamp":1750506008165,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":91,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T00:00:00Z","timestamp":1750377600000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001459","name":"Ministry of Education - Singapore","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001459","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2246035"],"award-info":[{"award-number":["2246035"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100002418","name":"Intel Corporation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100002418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100016682","name":"VMware","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100016682","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,21]]},"DOI":"10.1145\/3695053.3731009","type":"proceedings-article","created":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T16:43:11Z","timestamp":1750437791000},"page":"1020-1034","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["The Sparsity-Aware LazyGPU Architecture"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9240-5926","authenticated-orcid":false,"given":"Changxi","family":"Liu","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7730-3437","authenticated-orcid":false,"given":"Miao","family":"Yu","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3532-6521","authenticated-orcid":false,"given":"Yifan","family":"Sun","sequence":"additional","affiliation":[{"name":"William &amp; Mary, Williamsburg, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8742-134X","authenticated-orcid":false,"given":"Trevor E.","family":"Carlson","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,20]]},"reference":[{"key":"e_1_3_3_1_2_2","volume-title":"USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek\u00a0Gordon Murray, Benoit Steiner, Paul\u00a0A. Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. TensorFlow: A System for Large-Scale Machine Learning. In USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_3_1_3_2","unstructured":"Advanced Micro Devices Inc.2015. AMD APP SDK. https:\/\/github.com\/ghostlander\/AMD-APP-SDK\/releases\/download\/v2.9.1\/AMD-APP-SDK-v2.9.1-lnx64.tar.xz"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783729"},{"key":"e_1_3_3_1_5_2","unstructured":"AMD. 2015. AMD Radeon R9 Series Gaming Graphics Cards with High-Bandwidth Memory."},{"key":"e_1_3_3_1_6_2","unstructured":"AMD. 2019. RDNA Architecture. https:\/\/GPUopen.com\/wp-content\/uploads\/2019\/08\/RDNA_Architecture_public.pdf."},{"volume-title":"AMD Radeon RX Graphics Cards","year":"2023","key":"e_1_3_3_1_7_2","unstructured":"AMD. 2023. AMD Radeon RX Graphics Cards."},{"volume-title":"Versal Adaptive SoC Programmable Network on Chip and Integrated Memory Controller 1.1 LogiCORE IP Product Guide (PG313)","year":"2024","key":"e_1_3_3_1_8_2","unstructured":"AMD. 2024. Versal Adaptive SoC Programmable Network on Chip and Integrated Memory Controller 1.1 LogiCORE IP Product Guide (PG313). https:\/\/docs.amd.com\/r\/en-US\/pg313-network-on-chip\/Read-Reorder-Buffer"},{"key":"e_1_3_3_1_9_2","volume-title":"International Symposium on Computer Architecture (ISCA)","author":"A\u015f\u0131l\u0131o\u011flu G\u00f6rkem","year":"2015","unstructured":"G\u00f6rkem A\u015f\u0131l\u0131o\u011flu, Zhaoxiang Jin, Murat K\u00f6ksal, Omkar Javeri, and Soner \u00d6nder. 2015. LaZy superscalar. In International Symposium on Computer Architecture (ISCA)."},{"key":"e_1_3_3_1_10_2","volume-title":"International Conference on Neural Information Processing Systems (NeurIPS)","author":"Baldi Pierre","year":"2013","unstructured":"Pierre Baldi and Peter Sadowski. 2013. Understanding dropout. In International Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3559009.3569666"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00015"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.16"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_3_1_15_2","volume-title":"USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie\u00a0Q. Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_3_1_16_2","volume-title":"USENIX Annual Technical Conference (USENIX ATC)","author":"Chen Yanhao","year":"2018","unstructured":"Yanhao Chen, Ari\u00a0B Hayes, Chi Zhang, Timothy Salmon, and Eddy\u00a0Z Zhang. 2018. Locality-Aware Software Throttling for Sparse Matrix Operation on GPUs. In USENIX Annual Technical Conference (USENIX ATC)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577500"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895592"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.Companion.2012.138"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2004.10008"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735702"},{"key":"e_1_3_3_1_22_2","unstructured":"Sambit Das Phani Motamarri Vishal Subramanian David\u00a0M Rogers and Vikram Gavini. 2022. DFT-FE 1.0: A massively parallel hybrid CPU-GPU density functional theory code using finite-element discretization. Computer Physics Communications (2022)."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00090"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2010.5470427"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3038228.3038239"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/1542275.1542288"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.23919\/DATE48585.2020.9116511"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Amir Gholami Zhewei Yao Sehoon Kim Coleman Hooper Michael\u00a0W Mahoney and Kurt Keutzer. 2024. AI and memory wall. IEEE Micro (2024).","DOI":"10.1109\/MM.2024.3373763"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00070"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/2458523.2458536"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339595"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00019"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00058"},{"key":"e_1_3_3_1_35_2","volume-title":"International Conference on Neural Information Processing Systems (NeurIPS)","author":"Han Song","year":"2015","unstructured":"Song Han, Jeff Pool, John Tran, and William\u00a0J Dally. 2015. Learning both weights and connections for efficient neural networks. In International Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2009.29"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337886"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2018.00024"},{"key":"e_1_3_3_1_40_2","volume-title":"Parallel Computing: On the Road to Exascale","author":"Krasnopolsky Boris","year":"2016","unstructured":"Boris Krasnopolsky and Alexey Medvedev. 2016. Acceleration of large scale OpenFOAM simulations on distributed systems with multicore CPUs and GPUs. In Parallel Computing: On the Road to Exascale."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5161058"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/2739480.2754652"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2010.44"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-70139-4"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173191"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623773"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00049"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304043"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2012.7476485"},{"key":"e_1_3_3_1_50_2","unstructured":"Xinxin Mei and Xiaowen Chu. 2016. Dissecting GPU memory hierarchy through microbenchmarking. Parallel and Distributed Systems (TPDS) (2016)."},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/2366231.2337168"},{"key":"e_1_3_3_1_52_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Merity Stephen","year":"2022","unstructured":"Stephen Merity, Caiming Xiong, James Bradbury, and Richard Socher. 2022. Pointer Sentinel Mixture Models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830790"},{"key":"e_1_3_3_1_54_2","unstructured":"Asit Mishra Jorge\u00a0Albericio Latorre Jeff Pool Darko Stosic Dusan Stosic Ganesh Venkatesh Chong Yu and Paulius Micikevicius. 2021. Accelerating sparse deep neural networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.08378 (2021)."},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623782"},{"key":"e_1_3_3_1_56_2","unstructured":"OpenAI. 2023. GPT-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00021"},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080254"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00063"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/2968456.2968476"},{"key":"e_1_3_3_1_61_2","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward\u00a0Z. Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_62_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322212"},{"key":"e_1_3_3_1_63_2","doi-asserted-by":"crossref","unstructured":"Arthur Perais. 2021. A Case for Speculative Strength Reduction. Computer Architecture Letters (CAL) (2021).","DOI":"10.1109\/LCA.2020.3048694"},{"key":"e_1_3_3_1_64_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835952"},{"key":"e_1_3_3_1_65_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056018"},{"key":"e_1_3_3_1_66_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480053"},{"key":"e_1_3_3_1_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_3_1_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322216"},{"key":"e_1_3_3_1_69_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056031"},{"key":"e_1_3_3_1_70_2","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155635"},{"key":"e_1_3_3_1_71_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01193"},{"key":"e_1_3_3_1_72_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2002.1003559"},{"key":"e_1_3_3_1_73_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Sun Mingjie","year":"2024","unstructured":"Mingjie Sun, Zhuang Liu, Anna Bair, and J.\u00a0Zico Kolter. 2024. A Simple and Effective Pruning Approach for Large Language Models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_74_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322230"},{"key":"e_1_3_3_1_75_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2016.7581262"},{"key":"e_1_3_3_1_76_2","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.14303"},{"key":"e_1_3_3_1_77_2","volume-title":"IEEE International Parallel and Distributed Processing Symposium (IPDPS)","author":"Tavana Mohammad\u00a0Khavari","year":"2019","unstructured":"Mohammad\u00a0Khavari Tavana, Yifan Sun, Nicolas\u00a0Bohm Agostini, and David Kaeli. 2019. Exploiting adaptive data compression to improve performance and energy-efficiency of compute workloads in multi-GPU systems. In IEEE International Parallel and Distributed Processing Symposium (IPDPS)."},{"key":"e_1_3_3_1_78_2","volume-title":"NVIDIA GPU Technology Conference","author":"Thomas-Collignon G","year":"2020","unstructured":"G Thomas-Collignon and V Mehta. 2020. Optimizing cuda applications for nvidia a100 GPU. In NVIDIA GPU Technology Conference."},{"key":"e_1_3_3_1_79_2","unstructured":"TOP500. 2025. TOP500 Supercomputer Sites. https:\/\/www.top500.org\/."},{"key":"e_1_3_3_1_80_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aur\u00e9lien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. CoRR (2023). arXiv:https:\/\/arXiv.org\/abs\/2302.13971"},{"key":"e_1_3_3_1_81_2","doi-asserted-by":"crossref","unstructured":"Henk\u00a0A Van\u00a0der Vorst. 1992. Bi-CGSTAB: A fast and smoothly converging variant of Bi-CG for the solution of nonsymmetric linear systems. SIAM Journal on scientific and Statistical Computing (1992).","DOI":"10.1137\/0913035"},{"key":"e_1_3_3_1_82_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00085"},{"key":"e_1_3_3_1_83_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00088"},{"key":"e_1_3_3_1_84_2","doi-asserted-by":"publisher","DOI":"10.1109\/RTSS46320.2019.00042"},{"key":"e_1_3_3_1_85_2","doi-asserted-by":"publisher","DOI":"10.1145\/3545008.3545042"},{"key":"e_1_3_3_1_86_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00029"},{"key":"e_1_3_3_1_87_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00082"},{"key":"e_1_3_3_1_88_2","doi-asserted-by":"crossref","unstructured":"Hongkuan Zhou Ajitesh Srivastava Hanqing Zeng Rajgopal Kannan and Viktor Prasanna. 2021. Accelerating large scale real-time GNN inference using channel pruning. Proceedings of the VLDB Endowment (2021).","DOI":"10.14778\/3461535.3461547"},{"key":"e_1_3_3_1_89_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507708"},{"key":"e_1_3_3_1_90_2","unstructured":"Michael Zhu and Suyog Gupta. 2017. To prune or not to prune: exploring the efficacy of pruning for model compression. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1710.01878 (2017)."},{"key":"e_1_3_3_1_91_2","unstructured":"Michael Zhu and Suyog Gupta. 2017. To prune or not to prune: exploring the efficacy of pruning for model compression. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1710.01878 (2017)."},{"key":"e_1_3_3_1_92_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358269"}],"event":{"name":"ISCA '25: Proceedings of the 52nd Annual International Symposium on Computer Architecture","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"],"location":"Tokyo Japan","acronym":"SIGARCH '25"},"container-title":["Proceedings of the 52nd Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731009","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731009","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T11:07:12Z","timestamp":1750504032000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695053.3731009"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,20]]},"references-count":91,"alternative-id":["10.1145\/3695053.3731009","10.1145\/3695053"],"URL":"https:\/\/doi.org\/10.1145\/3695053.3731009","relation":{},"subject":[],"published":{"date-parts":[[2025,6,20]]},"assertion":[{"value":"2025-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}