{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,2]],"date-time":"2026-07-02T04:30:27Z","timestamp":1782966627200,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":78,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3725745","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"119-134","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["StructILU: Dependency-Preserving Incomplete LU with Hierarchical Parallelism for Structured Grid PDEs on GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-1038-944X","authenticated-orcid":false,"given":"Hao","family":"Luo","sequence":"first","affiliation":[{"name":"School of Mathematical Sciences, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5021-2912","authenticated-orcid":false,"given":"Qianchao","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of Mathematical Sciences, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2127-6011","authenticated-orcid":false,"given":"Xiaochen","family":"Hao","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0807-0029","authenticated-orcid":false,"given":"Chunxi","family":"Lei","sequence":"additional","affiliation":[{"name":"School of Mathematical Sciences, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4102-6683","authenticated-orcid":false,"given":"Chengdi","family":"Ma","sequence":"additional","affiliation":[{"name":"School of Mathematical Sciences, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4276-0510","authenticated-orcid":false,"given":"Chenchen","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Mathematical Sciences, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9076-7998","authenticated-orcid":false,"given":"Yun","family":"Liang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7426-6248","authenticated-orcid":false,"given":"Chao","family":"Yang","sequence":"additional","affiliation":[{"name":"School of Mathematical Sciences, Peking University, Beijing, China and PKU-Changsha Institute for Computing and Digital Economy, Changsha, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-57675-2_33"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Edward Anderson and Youcef Saad. 1989. Solving sparse triangular linear systems on parallel computers. International Journal of High Speed Computing 1 01 (1989) 73\u201395.","DOI":"10.1142\/S0129053389000056"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Hartwig Anzt Terry Cojean Goran Flegar Fritz G\u00f6bel Thomas Gr\u00fctzmacher Pratik Nayak Tobias Ribizel Yuhsiang\u00a0Mike Tsai and Enrique\u00a0S Quintana-Ort\u00ed. 2022. Ginkgo: A modern linear operator algebra framework for high performance computing. ACM Transactions on Mathematical Software (TOMS) 48 1 (2022) 1\u201333.","DOI":"10.1145\/3480935"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00033"},{"key":"e_1_3_3_1_6_2","unstructured":"Michele Benzi Wayne Joubert and Gabriel Mateescu. 1999. Numerical experiments with parallel orderings for ILU preconditioners. Electronic Transactions on Numerical Analysis 8 (1999) 88\u2013114."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611976311"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Huanqi Cao Shizhi Tang Qianchao Zhu Bowen Yu and Wenguang Chen. 2023. Mat2Stencil: A Modular Matrix-Based DSL for Explicit and Implicit Matrix-Free PDE Solvers on Structured Grid. Proceedings of the ACM on Programming Languages 7 OOPSLA2 (2023) 686\u2013715.","DOI":"10.1145\/3622822"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Arielle Carr Eric de Sturler and Serkan Gugercin. 2021. Preconditioning parametrized linear systems. SIAM Journal on Scientific Computing 43 3 (2021) A2242\u2013A2267.","DOI":"10.1137\/20M1331123"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651379"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638476"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Jack Choquette Wishwesh Gandhi Olivier Giroux Nick Stam and Ronny Krashinsky. 2021. Nvidia a100 tensor core gpu: Performance and innovation. IEEE Micro 41 2 (2021) 29\u201335.","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-20119-1_1"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Edmond Chow and Aftab Patel. 2015. Fine-grained parallel incomplete LU factorization. SIAM journal on Scientific Computing 37 2 (2015) C169\u2013C193.","DOI":"10.1137\/140968896"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Cu Cui. 2024. Acceleration of tensor-product operations with tensor cores. ACM Transactions on Parallel Computing 11 4 (2024) 1\u201324.","DOI":"10.1145\/3695466"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.5555\/264989"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Shun Doi and Takumi Washio. 1999. Ordering strategies and related techniques to overcome the trade-off between parallelism and convergence in incomplete factorizations. Parallel Comput. 25 13-14 (1999) 1995\u20132014.","DOI":"10.1016\/S0167-8191(99)00064-2"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-23397-5_8"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2018.00101"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/PDP2018.2018.00034"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Ernesto Dufrechou Pablo Ezzatti Manuel Freire and Enrique\u00a0S Quintana-Ort\u00ed. 2021. Machine learning for optimal selection of sparse triangular system solvers on GPUs. J. Parallel and Distrib. Comput. 158 (2021) 47\u201355.","DOI":"10.1016\/j.jpdc.2021.07.013"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/SBAC-PAD.2019.00020"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Manuel Freire Juan Ferrand Franco Seveso Ernesto Dufrechou and Pablo Ezzatti. 2023. A GPU method for the analysis stage of the SPTRSV kernel. The Journal of Supercomputing 79 13 (2023) 15051\u201315078.","DOI":"10.1007\/s11227-023-05238-8"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-36569-9_13"},{"key":"e_1_3_3_1_25_2","first-page":"333","volume-title":"Proceedings of Parallel CFD","volume":"97","author":"Gropp William\u00a0D","year":"1997","unstructured":"William\u00a0D Gropp, LC McInnes, MD Tidriri, and DE Keyes. 1997. Parallel implicit PDE computations: Algorithms and software. In Proceedings of Parallel CFD , Vol.\u00a097. Citeseer, 333\u2013344."},{"key":"e_1_3_3_1_26_2","unstructured":"Zhengding Hu Jingwei Sun Zhongyang Li and Guangzhong Sun. 2024. AG-SpTRSV: An Automatic Framework to Optimize Sparse Triangular Solve on GPUs. ACM Transactions on Architecture and Code Optimization (2024)."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Carlo Janna Massimilano Ferronato and Giuseppe Gambolati. 2010. A block FSAI-ILU parallel preconditioner for symmetric positive definite linear systems. SIAM Journal on Scientific Computing 32 5 (2010) 2468\u20132484.","DOI":"10.1137\/090779760"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Wayne Joubert and Thomas Oppe. 1994. Improved SSOR and incomplete Cholesky solution of linear equations on shared memory and distributed memory parallel computers. Numerical Linear Algebra with Applications 1 3 (1994) 287\u2013311.","DOI":"10.1002\/nla.1680010306"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Aditya Kashi and Sivakumaran Nadarajah. 2021. An asynchronous incomplete block LU preconditioner for computational fluid dynamics on unstructured grids. SIAM Journal on Scientific Computing 43 1 (2021) C1\u2013C30.","DOI":"10.1137\/19M1301084"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"crossref","unstructured":"Cetin\u00a0C Kiris Dochan Kwak and Stuart\u00a0E Rogers. 2003. Incompressible Navier-Stokes solvers in primitive variables and their applications to steady and unsteady flow simulations. Numerical Simulations of Incompressible Flows (2003) 3\u201334.","DOI":"10.1142\/9789812796837_0001"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Byungjoon Lee and Chohong Min. 2021. Optimal preconditioners on solving the Poisson equation with Neumann boundary conditions. J. Comput. Phys. 433 (2021) 110189.","DOI":"10.1016\/j.jcp.2021.110189"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-0348-8629-1"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Jianjiang Li Jiabi Liang Wei Xue Zhengding Hu Lin Li and Jinliang Shi. 2024. Toward efficient structured-grid triangular solver on sunway many-core processors. The Journal of Supercomputing 80 8 (2024) 10610\u201310636.","DOI":"10.1007\/s11227-023-05802-2"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Ruipeng Li and Yousef Saad. 2013. GPU-accelerated preconditioned iterative linear solvers. The Journal of Supercomputing 63 (2013) 443\u2013466.","DOI":"10.1007\/s11227-012-0825-3"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611976137.10"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Yibao Li Lujing Zhang Qing Xia Qian Yu and Junseok Kim. 2021. An unconditionally energy-stable second-order time-accurate numerical scheme for the coupled Cahn\u2013Hilliard system in copolymer\/homopolymer mixtures. Computational Materials Science 200 (2021) 110809.","DOI":"10.1016\/j.commatsci.2021.110809"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"Zhongze Li and Yousef Saad. 2006. SchurRAS: A restricted version of the overlapping Schur complement preconditioner. SIAM Journal on Scientific Computing 27 5 (2006) 1787\u20131801.","DOI":"10.1137\/040608350"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-43659-3_45"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532392"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607092"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00058"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"Zhengyang Lu and Weifeng Liu. 2023. Tilesptrsv: a tiled algorithm for parallel sparse triangular solve on gpus. CCF Transactions on High Performance Computing 5 2 (2023) 129\u2013143.","DOI":"10.1007\/s42514-023-00151-1"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3404397.3404413"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"crossref","unstructured":"Lixiang Luo Jack\u00a0R Edwards Hong Luo and Frank Mueller. 2015. A fine-grained block ILU scheme on regular structures for GPGPUs. Computers & Fluids 119 (2015) 149\u2013161.","DOI":"10.1016\/j.compfluid.2015.07.005"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Wenpeng Ma and Xiao-Chuan Cai. 2021. Point-block incomplete LU preconditioning with asynchronous iterations on GPU for multiphysics problems. The International Journal of High Performance Computing Applications 35 2 (2021) 121\u2013135.","DOI":"10.1177\/1094342020981153"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"crossref","unstructured":"J\u00a0Andvandervorst Meijerink and Henk\u00a0A Van Der\u00a0Vorst. 1977. An iterative solution method for linear systems of which the coefficient matrix is a symmetric M-matrix. Math. Comp. 31 137 (1977) 148\u2013162.","DOI":"10.1090\/S0025-5718-1977-0438681-4"},{"key":"e_1_3_3_1_47_2","unstructured":"Maxim Naumov. 2011. Parallel solution of sparse triangular linear systems in the preconditioned iterative methods on the GPU. NVIDIA Corp. Westford MA USA Tech. Rep. NVR-2011 1 (2011)."},{"key":"e_1_3_3_1_48_2","unstructured":"Maxim Naumov. 2012. Parallel incomplete-LU and Cholesky factorization in the preconditioned iterative methods on the GPU. Nvidia Technical Report NVR-2012-003 (2012)."},{"key":"e_1_3_3_1_49_2","unstructured":"Maxim Naumov Patrice Castonguay and Jonathan Cohen. 2015. Parallel graph coloring with applications to the incomplete-lu factorization on the gpu. Nvidia White Paper (2015)."},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"crossref","unstructured":"Natalia\u00a0K Nikolova Helen\u00a0W Tam and Mohamed\u00a0H Bakr. 2004. Sensitivity analysis with the FDTD method on structured grids. IEEE Transactions on Microwave Theory and Techniques 52 4 (2004) 1207\u20131216.","DOI":"10.1109\/TMTT.2004.825710"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-07518-1_8"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"crossref","unstructured":"Eugene\u00a0L Poole and James\u00a0M Ortega. 1987. Multicolor ICCG methods for vector computers. SIAM J. Numer. Anal. 24 6 (1987) 1394\u20131418.","DOI":"10.1137\/0724090"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.5555\/829576"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"crossref","unstructured":"Joel\u00a0H Saltz. 1990. Aggregation methods for solving sparse triangular systems on multiprocessors. SIAM J. Sci. Statist. Comput. 11 1 (1990) 123\u2013144.","DOI":"10.1137\/0911008"},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"crossref","unstructured":"Barry Smith and Hong Zhang. 2011. Sparse triangular solves for ILU revisited: Data layout crucial to better performance. The International Journal of High Performance Computing Applications 25 4 (2011) 386\u2013391.","DOI":"10.1177\/1094342010389857"},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-94-011-5412-3_8"},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/3404397.3404400"},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPPW.2012.23"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"crossref","unstructured":"Kengo Suzuki Takeshi Fukaya and Takeshi Iwashita. 2023. A novel ILU preconditioning method with a block structure suitable for SIMD vectorization. J. Comput. Appl. Math. 419 (2023) 114687.","DOI":"10.1016\/j.cam.2022.114687"},{"key":"e_1_3_3_1_60_2","unstructured":"Ruhollah Tavakoli. 2010. Parallelizing Sequential Sweeping on Structured Grids\u2013Fully Parallel SOR\/ILU preconditioners for Structured n-Diagonal Matrices. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1008.3699 (2010)."},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"crossref","unstructured":"Henk\u00a0A van\u00a0der Vorst. 1989. High performance preconditioning. SIAM J. Sci. Statist. Comput. 10 6 (1989) 1174\u20131185.","DOI":"10.1137\/0910071"},{"key":"e_1_3_3_1_62_2","doi-asserted-by":"publisher","DOI":"10.1145\/3061639.3062185"},{"key":"e_1_3_3_1_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178513"},{"key":"e_1_3_3_1_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3225058.3225071"},{"key":"e_1_3_3_1_65_2","doi-asserted-by":"crossref","unstructured":"Andrew\u00a0M Wissink Anastasios\u00a0S Lyrintzis and Roger\u00a0C Strawn. 1996. Parallelization of a three-dimensional flow solver for Euler rotorcraft aerodynamics predictions. AIAA Journal 34 11 (1996) 2276\u20132283.","DOI":"10.2514\/3.13391"},{"key":"e_1_3_3_1_66_2","doi-asserted-by":"crossref","unstructured":"Yuanzhe Xi and Yousef Saad. 2017. A rational function preconditioner for indefinite sparse linear systems. SIAM Journal on Scientific Computing 39 3 (2017) A1145\u2013A1167.","DOI":"10.1137\/16M1078409"},{"key":"e_1_3_3_1_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3472456.3472478"},{"key":"e_1_3_3_1_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/3078597.3078602"},{"key":"e_1_3_3_1_69_2","unstructured":"Tianshi Xu Ruipeng Li and Daniel Osei-Kuffuor. 2023. A two-level GPU-accelerated incomplete LU preconditioner for general sparse linear systems. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08881 (2023)."},{"key":"e_1_3_3_1_70_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.81"},{"key":"e_1_3_3_1_71_2","doi-asserted-by":"publisher","DOI":"10.1145\/3404397.3404428"},{"key":"e_1_3_3_1_72_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.5"},{"key":"e_1_3_3_1_73_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00065"},{"key":"e_1_3_3_1_74_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC-DSS-SmartCity-DependSys53884.2021.00036"},{"key":"e_1_3_3_1_75_2","volume-title":"Iterative solution of large linear systems","author":"Young David\u00a0M","year":"2014","unstructured":"David\u00a0M Young. 2014. Iterative solution of large linear systems. Elsevier."},{"key":"e_1_3_3_1_76_2","doi-asserted-by":"crossref","unstructured":"Feng Zhang Jiya Su Weifeng Liu Bingsheng He Ruofan Wu Xiaoyong Du and Rujia Wang. 2021. Yuenyeungsptrsv: a thread-level and warp-level fusion synchronization-free sparse triangular solve. IEEE Transactions on Parallel and Distributed Systems 32 9 (2021) 2321\u20132337.","DOI":"10.1109\/TPDS.2021.3066635"},{"key":"e_1_3_3_1_77_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476158"},{"key":"e_1_3_3_1_78_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638482"},{"key":"e_1_3_3_1_79_2","doi-asserted-by":"publisher","DOI":"10.1145\/3673038.3673040"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3725745","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:03:20Z","timestamp":1755867800000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3725745"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":78,"alternative-id":["10.1145\/3721145.3725745","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3725745","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}