{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:14Z","timestamp":1755870014000,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["RS-2023-00222663"],"award-info":[{"award-number":["RS-2023-00222663"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010418","name":"Institute for Information and communications Technology Promotion","doi-asserted-by":"publisher","award":["2018-0-00581"],"award-info":[{"award-number":["2018-0-00581"]}],"id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]},{"name":"BK21 FOUR Intelligence Computing","award":["4199990214639"],"award-info":[{"award-number":["4199990214639"]}]},{"DOI":"10.13039\/501100014188","name":"Ministry of Science and ICT, South Korea","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100014188","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Samsung Display Co., Ltd"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3734531","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"426-441","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SnuSOLVER: Optimizing Sparse Direct Solvers for Heterogeneous Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-9042-0435","authenticated-orcid":false,"given":"Chaewon","family":"Kim","sequence":"first","affiliation":[{"name":"Department of Computer Science and Engineering, Seoul National University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7485-9277","authenticated-orcid":false,"given":"Jaehwan","family":"Lee","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Seoul National University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7775-3543","authenticated-orcid":false,"given":"Jinpyo","family":"Kim","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Seoul National University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0809-8885","authenticated-orcid":false,"given":"Dohyun","family":"Kim","sequence":"additional","affiliation":[{"name":"Institute of Computer Technology, Seoul National University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3548-833X","authenticated-orcid":false,"given":"Kyusu","family":"Ahn","sequence":"additional","affiliation":[{"name":"Research Center, Samsung Display Co., Ltd., Yongin, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1619-2469","authenticated-orcid":false,"given":"Hyung Uk","family":"Cho","sequence":"additional","affiliation":[{"name":"Research Center, Samsung Display Co., Ltd., Yongin, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5434-2574","authenticated-orcid":false,"given":"Seungin","family":"Baek","sequence":"additional","affiliation":[{"name":"Research Center, Samsung Display Co., Ltd., Yongin, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4638-8170","authenticated-orcid":false,"given":"Jaejin","family":"Lee","sequence":"additional","affiliation":[{"name":"Dept. of Data Science, Dept. of Computer Science and Engineering, Seoul National University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_2_2","first-page":"121","volume-title":"International Workshop on Applied Parallel Computing","author":"Amestoy Patrick\u00a0R","year":"2000","unstructured":"Patrick\u00a0R Amestoy, Iain\u00a0S Duff, Jean-Yves L\u2019Excellent, and Jacko Koster. 2000. MUMPS: a general purpose distributed memory sparse solver. In International Workshop on Applied Parallel Computing. Springer, 121\u2013130."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","unstructured":"Hartwig Anzt Edmond Chow and Jack Dongarra. 2018. ParILUT\u2014A New Parallel Threshold ILU Factorization. SIAM Journal on Scientific Computing 40 4 (2018) C503\u2013C519. 10.1137\/16M1079506","DOI":"10.1137\/16M1079506"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"C.\u00a0Cleveland Ashcraft Roger\u00a0G. Grimes John\u00a0Gregg Lewis Barry\u00a0W. Peyton Horst\u00a0D. Simon and Petter\u00a0E. Bj\u00f8rstad. 1987. Progress in Sparse Matrix Methods for Large Linear Systems On Vector Supercomputers. International Journal of High Performance Computing Applications 1 (1987) 10 \u2013 30. https:\/\/api.semanticscholar.org\/CorpusID:62698847","DOI":"10.1177\/109434208700100403"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-43736-7_1"},{"key":"e_1_3_3_1_6_2","first-page":"445","volume-title":"Proceedings of the Sixth SIAM Conference on Parallel Processing for Scientific Computing, PP 1993, Norfolk, Virginia, USA, March 22-24, 1993","author":"Bui Thang\u00a0Nguyen","year":"1993","unstructured":"Thang\u00a0Nguyen Bui and Curt Jones. 1993. A Heuristic for Reducing Fill-In in Sparse Matrix Factorization. In Proceedings of the Sixth SIAM Conference on Parallel Processing for Scientific Computing, PP 1993, Norfolk, Virginia, USA, March 22-24, 1993, Richard\u00a0F. Sincovec, David\u00a0E. Keyes, Michael\u00a0R. Leuze, Linda\u00a0R. Petzold, and Daniel\u00a0A. Reed (Eds.). SIAM, 445\u2013452."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","unstructured":"Timothy\u00a0A. Davis and Yifan Hu. 2011. The university of Florida sparse matrix collection. ACM Trans. Math. Softw. 38 1 Article 1 (dec 2011) 25\u00a0pages. 10.1145\/2049662.2049663","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","unstructured":"Timothy\u00a0A. Davis and Ekanathan Palamadai\u00a0Natarajan. 2010. Algorithm 907: KLU A Direct Sparse Solver for Circuit Simulation Problems. ACM Trans. Math. Softw. 37 3 (2010). 10.1145\/1824801.1824814","DOI":"10.1145\/1824801.1824814"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","unstructured":"James\u00a0W. Demmel Stanley\u00a0C. Eisenstat John\u00a0R. Gilbert Xiaoye\u00a0S. Li and Joseph W.\u00a0H. Liu. 1999. A Supernodal Approach to Sparse Partial Pivoting. SIAM J. Matrix Anal. Appl. 20 3 (1999) 720\u2013755. 10.1137\/S0895479895291765 arXiv:10.1137\/S0895479895291765","DOI":"10.1137\/S0895479895291765"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Inderjit\u00a0S Dhillon Yuqiang Guan and Brian Kulis. 2007. Weighted graph cuts without eigenvectors a multilevel approach. IEEE transactions on pattern analysis and machine intelligence 29 11 (2007) 1944\u20131957.","DOI":"10.1109\/TPAMI.2007.1115"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","unstructured":"Jack Dongarra Piotr Luszczek and Antoine Petitet. 2003. The LINPACK Benchmark: past present and future. Concurrency and Computation: Practice and Experience 15 (08 2003) 803\u2013820. 10.1002\/cpe.728","DOI":"10.1002\/cpe.728"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","unstructured":"I.\u00a0S. Duff and J.\u00a0K. Reid. 1983. The Multifrontal Solution of Indefinite Sparse Symmetric Linear. ACM Trans. Math. Softw. 9 3 (sep 1983) 302\u2013325. 10.1145\/356044.356047","DOI":"10.1145\/356044.356047"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2018.00101"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607050"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","unstructured":"Alan George. 1973. Nested Dissection of a Regular Finite Element Mesh. SIAM J. Numer. Anal. 10 2 (1973) 345\u2013363. 10.1137\/0710032","DOI":"10.1137\/0710032"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","unstructured":"Pieter Ghysels and Ryan Synk. 2022. High performance sparse multifrontal solvers on modern GPUs. Parallel Comput. 110 (2022) 102897. 10.1016\/j.parco.2022.102897","DOI":"10.1016\/j.parco.2022.102897"},{"key":"e_1_3_3_1_17_2","volume-title":"Using MPI: portable parallel programming with the message-passing interface","author":"Gropp William","year":"1999","unstructured":"William Gropp, Ewing Lusk, and Anthony Skjellum. 1999. Using MPI: portable parallel programming with the message-passing interface. Vol.\u00a01. MIT press."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","unstructured":"Kai He Sheldon X.\u00a0D. Tan Hai Wang and Guoyong Shi. 2016. GPU-Accelerated Parallel Sparse LU Factorization Method for Fast Circuit Analysis. IEEE Transactions on Very Large Scale Integration (VLSI) Systems 24 3 (2016) 1140\u20131150. 10.1109\/TVLSI.2015.2421287","DOI":"10.1109\/TVLSI.2015.2421287"},{"key":"e_1_3_3_1_19_2","unstructured":"Intel. 2024. Intel OneAPI Math Kernel Library (MKL). https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/oneapi\/onemkl.html Accessed: 2024-07-30."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","unstructured":"George Karypis and Vipin Kumar. 1998. A Fast and High Quality Multilevel Scheme for Partitioning Irregular Graphs. SIAM Journal on Scientific Computing 20 1 (1998) 359\u2013392. 10.1137\/S1064827595287997","DOI":"10.1137\/S1064827595287997"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","unstructured":"George Karypis and Vipin Kumar. 1998. A Parallel Algorithm for Multilevel Graph Partitioning and Sparse Matrix Ordering. J. Parallel and Distrib. Comput. 48 1 (1998) 71\u201395. 10.1006\/jpdc.1997.1403","DOI":"10.1006\/jpdc.1997.1403"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532370"},{"key":"e_1_3_3_1_23_2","volume-title":"A Hybrid GPU-CPU Parallel CM Reordering Algorithm for Bandwidth Reduction of Large Sparse Matrices","author":"Li Ang","year":"2015","unstructured":"Ang Li, Radu Serban, and Dan Negrut. 2015. A Hybrid GPU-CPU Parallel CM Reordering Algorithm for Bandwidth Reduction of Large Sparse Matrices. Technical Report."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","unstructured":"Xiaoye\u00a0S. Li and James\u00a0W. Demmel. 2003. SuperLU_DIST: A Scalable Distributed-Memory Sparse Direct Solver for Unsymmetric Linear Systems. ACM Trans. Math. Softw. 29 2 (jun 2003) 110\u2013140. 10.1145\/779359.779361","DOI":"10.1145\/779359.779361"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","unstructured":"Xiaoye\u00a0S. Li Paul Lin Yang Liu and Piyush Sao. 2023. Newly Released Capabilities in the Distributed-Memory SuperLU Sparse Direct Solver. ACM Trans. Math. Softw. 49 1 Article 10 (mar 2023) 20\u00a0pages. 10.1145\/3577197","DOI":"10.1145\/3577197"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","unstructured":"Richard\u00a0J. Lipton Donald\u00a0J. Rose and Robert\u00a0Endre Tarjan. 1979. Generalized Nested Dissection. SIAM J. Numer. Anal. 16 2 (1979) 346\u2013358. 10.1137\/0716027","DOI":"10.1137\/0716027"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","unstructured":"Murat Manguoglu. 2011. A domain-decomposing parallel sparse linear system solver. J. Comput. Appl. Math. 236 3 (2011) 319\u2013325. 10.1016\/j.cam.2011.07.017Aspects of Numerical Algorithms Parallelization and Applications.","DOI":"10.1016\/j.cam.2011.07.017"},{"key":"e_1_3_3_1_28_2","unstructured":"NVIDIA. 2024. cuBLAS. https:\/\/developer.nvidia.com\/cublas Accessed: 2024-07-30."},{"key":"e_1_3_3_1_29_2","unstructured":"NVIDIA. 2024. cuSOLVER. https:\/\/developer.nvidia.com\/cusolver Accessed: 2024-07-30."},{"key":"e_1_3_3_1_30_2","unstructured":"NVIDIA. 2024. GPUDirect. https:\/\/developer.nvidia.com\/gpudirect Accessed: 2024-07-30."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611976465.31"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","unstructured":"Shaoyi Peng and Sheldon X.-D. Tan. 2020. GLU3.0: Fast GPU-based Parallel Sparse LU Factorization for Circuit Simulation. IEEE Design & Test 37 3 (2020) 78\u201390. 10.1109\/MDAT.2020.2974910","DOI":"10.1109\/MDAT.2020.2974910"},{"key":"e_1_3_3_1_33_2","unstructured":"Antoine Petitet R.\u00a0Clint Whaley Jack\u00a0J. Dongarra and Andy Cleary. 2004. HPL-A portable implementation of the high-performance Linpack benchmark for distributed-memory computers. http:\/\/www. netlib. org\/benchmark\/hpl\/ (2004)."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","unstructured":"Andrey Petrushov and Boris Krasnopolsky. 2023. Automated tuning for the parameters of linear solvers. J. Comput. Phys. 494 (2023) 112533. 10.1016\/j.jcp.2023.112533","DOI":"10.1016\/j.jcp.2023.112533"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","unstructured":"Alex Pothen Horst\u00a0D. Simon and Kang-Pu Liou. 1990. Partitioning Sparse Matrices with Eigenvectors of Graphs. SIAM J. Matrix Anal. Appl. 11 3 (1990) 430\u2013452. 10.1137\/0611030","DOI":"10.1137\/0611030"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Fran\u00e7ois-Henry Rouet Xiaoye\u00a0S Li Pieter Ghysels and Artem Napov. 2016. A distributed-memory package for dense hierarchically semi-separable matrix computations using randomization. ACM Transactions on Mathematical Software (TOMS) 42 4 (2016) 1\u201335.","DOI":"10.1145\/2930660"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2018.00100"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-09873-9_41"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247767"},{"key":"e_1_3_3_1_40_2","unstructured":"Buse Yilmaz. 2021. Graph Transformation and Specialized Code Generation For Sparse Triangular Solve (SpTRSV). arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2103.11445 (2021)."}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3734531","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:04:28Z","timestamp":1755867868000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3734531"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":39,"alternative-id":["10.1145\/3721145.3734531","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3734531","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}