{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:25:06Z","timestamp":1773318306421,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759820","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:05:39Z","timestamp":1762963539000},"page":"1495-1509","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["SparStencil: Retargeting Sparse Tensor Cores to Scientific Stencil Computations via Structured Sparsity Transformation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-2565-7749","authenticated-orcid":false,"given":"Qi","family":"Li","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1013-1325","authenticated-orcid":false,"given":"Kun","family":"Li","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3075-3385","authenticated-orcid":false,"given":"Haozhi","family":"Han","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3406-2907","authenticated-orcid":false,"given":"Liang","family":"Yuan","sequence":"additional","affiliation":[{"name":"Chinee Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7520-9640","authenticated-orcid":false,"given":"Yunquan","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2392-8472","authenticated-orcid":false,"given":"Yifeng","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6487-3658","authenticated-orcid":false,"given":"Junshi","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3900-3722","authenticated-orcid":false,"given":"Hong","family":"An","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9107-013X","authenticated-orcid":false,"given":"Ting","family":"Cao","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6455-3898","authenticated-orcid":false,"given":"Mao","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_3_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.9"},{"key":"e_1_3_3_3_3_2","unstructured":"Krste Asanovic Ras Bodik Bryan\u00a0Christopher Catanzaro Joseph\u00a0James Gebis Parry Husbands Kurt Keutzer David\u00a0A Patterson William\u00a0Lester Plishker John Shalf Samuel\u00a0Webb Williams et\u00a0al. 2006. The landscape of parallel computing research: A view from berkeley. (2006)."},{"key":"e_1_3_3_3_4_2","doi-asserted-by":"publisher","unstructured":"Krste Asanovic Rastislav Bodik James Demmel Tony Keaveny Kurt Keutzer John Kubiatowicz Nelson Morgan David Patterson Koushik Sen John Wawrzynek David Wessel and Katherine Yelick. 2009. A View of the Parallel Computing Landscape. Commun. ACM 52 10 (oct 2009) 56\u201367. 10.1145\/1562764.1562783","DOI":"10.1145\/1562764.1562783"},{"key":"e_1_3_3_3_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.107"},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00078"},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"publisher","unstructured":"Uday Bondhugula Vinayaka Bandishti and Irshad Pananilath. 2017. Diamond Tiling: Tiling Techniques to Maximize Parallelism for Stencil Computations. IEEE Transactions on Parallel and Distributed Systems 28 5 (May 2017) 1285\u20131298. 10.1109\/TPDS.2016.2615094","DOI":"10.1109\/TPDS.2016.2615094"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/1375581.1375595"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356162"},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638476"},{"key":"e_1_3_3_3_11_2","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1410.0759 (2014)."},{"key":"e_1_3_3_3_12_2","unstructured":"Alain Denzler Rahul Bera Nastaran Hajinazar Gagandeep Singh Geraldo\u00a0F. Oliveira Juan G\u00f3mez-Luna and Onur Mutlu. 2023. Casper: Accelerating Stencil Computation using Near-cache Processing. arxiv:https:\/\/arXiv.org\/abs\/2112.14216\u00a0[cs.AR] https:\/\/arxiv.org\/abs\/2112.14216"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","unstructured":"Jack Edmonds. 1965. Paths Trees and Flowers. Canadian Journal of Mathematics 17 (1965) 449\u2013467. 10.4153\/CJM-1965-045-4","DOI":"10.4153\/CJM-1965-045-4"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/SYNASC.2014.70"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/2458523.2458526"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","unstructured":"Tobias Gysi Christoph M\u00fcller Oleksandr Zinenko Stephan Herhut Eddie Davis Tobias Wicky Oliver Fuhrer Torsten Hoefler and Tobias Grosser. 2021. Domain-Specific Multi-Level IR Rewriting for GPU: The Open Earth Compiler for GPU-Accelerated Climate Simulation. ACM Trans. Archit. Code Optim. 18 4 Article 51 (sep 2021) 23\u00a0pages. 10.1145\/3469030","DOI":"10.1145\/3469030"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3710848.3710897"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-19861-8_13"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/2464996.2467268"},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/2304576.2304619"},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"publisher","unstructured":"H.T. Huynh Z.J. Wang and P.E. Vincent. 2014. High-order methods for computational fluid dynamics: A brief review of compact differential formulations on unstructured grids. Computers & Fluids 98 (2014) 209\u2013220. 10.1016\/j.compfluid.2013.12.00712th USNCCM mini-symposium of High-Order Methods for Computational Fluid Dynamics - A special issue dedicated to the 80th birthday of Professor Antony Jameson.","DOI":"10.1016\/j.compfluid.2013.12.007"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00035"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/582034.582077"},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476154"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532392"},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"publisher","unstructured":"David\u00a0J. Lusher Satya\u00a0P. Jammy and Neil\u00a0D. Sandham. 2021. OpenSBLI: Automated code-generation for heterogeneous computing architectures applied to compressible fluid dynamics on structured grids. Computer Physics Communications 267 (2021) 108063. 10.1016\/j.cpc.2021.108063","DOI":"10.1016\/j.cpc.2021.108063"},{"key":"e_1_3_3_3_27_2","first-page":"89","volume-title":"Proceedings of the 1st international workshop on high-performance stencil computations, Vienna","author":"Maruyama Naoya","year":"2014","unstructured":"Naoya Maruyama and Takayuki Aoki. 2014. Optimizing stencil computations for NVIDIA Kepler GPUs. In Proceedings of the 1st international workshop on high-performance stencil computations, Vienna. Citeseer, 89\u201395."},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063398"},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3368826.3377904"},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/1542275.1542313"},{"key":"e_1_3_3_3_31_2","unstructured":"Nvidia. 2023. cuDNN. https:\/\/developer.nvidia.com\/cudnn Last accessed on 2023-7-24."},{"key":"e_1_3_3_3_32_2","volume-title":"NVIDIA A100 Tensor Core GPU Datasheet","author":"Corporation NVIDIA","year":"2020","unstructured":"NVIDIA Corporation. 2020. NVIDIA A100 Tensor Core GPU Datasheet. https:\/\/www.nvidia.com\/en-us\/data-center\/a100\/ Accessed: 2024-11-21."},{"key":"e_1_3_3_3_33_2","volume-title":"Parallel Thread Execution ISA Version 8.5","author":"Corporation NVIDIA","year":"2024","unstructured":"NVIDIA Corporation. 2024. Parallel Thread Execution ISA Version 8.5. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html Accessed: 2024-11-21."},{"key":"e_1_3_3_3_34_2","unstructured":"Jeff Pool Abhishek Sawarkar and Jay Rodge. 2021. Accelerating Inference with Sparsity Using the NVIDIA Ampere Architecture and NVIDIA TensorRT. https:\/\/developer.nvidia.com\/blog\/accelerating-inference-with-sparsity-using-ampere-and-tensorrt\/ Accessed: 2024-11-22."},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_3_3_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/2830018.2830025"},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00049"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"publisher","unstructured":"Prashant\u00a0Singh Rawat Miheer Vaidya Aravind Sukumaran-Rajam Mahesh Ravishankar Vinod Grover Atanas Rountev Louis-No\u00ebl Pouchet and P. Sadayappan. 2018. Domain-Specific Optimization and Generation of High-Performance GPU Code for Stencil Computations. Proc. IEEE 106 11 (2018) 1902\u20131920. 10.1109\/JPROC.2018.2862896","DOI":"10.1109\/JPROC.2018.2862896"},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00073"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2000.10015"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","unstructured":"Kevin Stock Martin Kong Tobias Grosser Louis-No\u00ebl Pouchet Fabrice Rastello J. Ramanujam and P. Sadayappan. 2014. A Framework for Enhancing Data Reuse via Associative Reordering. SIGPLAN Not. 49 6 (jun 2014) 65\u201376. 10.1145\/2666356.2594342","DOI":"10.1145\/2666356.2594342"},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"publisher","unstructured":"Sven Verdoolaege Juan Carlos\u00a0Juega Albert Cohen Jos\u00e9 Ignacio\u00a0G\u00f3mez Christian Tenllado and Francky Catthoor. 2013. Polyhedral Parallel Code Generation for CUDA. ACM Trans. Archit. Code Optim. 9 4 Article 54 (jan 2013) 23\u00a0pages. 10.1145\/2400682.2400713","DOI":"10.1145\/2400682.2400713"},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00011"},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","unstructured":"David Wonnacott. 2002. Achieving Scalable Locality with Time Skewing. Int. J. Parallel Program. 30 3 (jun 2002) 181\u2013221. 10.1023\/A:1015460304860","DOI":"10.1023\/A:1015460304860"},{"key":"e_1_3_3_3_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC-DSS-SmartCity-DependSys53884.2021.00036"},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337835"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126920"},{"key":"e_1_3_3_3_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593705"},{"key":"e_1_3_3_3_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593716"},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00059"},{"key":"e_1_3_3_3_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356210"},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441598"},{"key":"e_1_3_3_3_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC.2018.00009"},{"key":"e_1_3_3_3_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527440"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759820","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:40:35Z","timestamp":1773254435000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759820"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":53,"alternative-id":["10.1145\/3712285.3759820","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759820","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}