{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T00:07:24Z","timestamp":1755907644728,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":31,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100007085","name":"National University of Defense Technology","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100007085","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3725756","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"135-148","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["IA-Chol: Input-Aware Cholesky Decomposition on CPU and GPU"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8080-0893","authenticated-orcid":false,"given":"Jixiao","family":"Deng","sequence":"first","affiliation":[{"name":"National University of Defense Technology, Laboratory of Digitizing Software for Frontier Equipment, Changsha, Hunan, China; National University of Defense Technology, National Key Laboratory of Parallel and Distributed Computing, Changsha, Hunan, China and National University of Defense Technology, College of Computer Science and Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8286-6566","authenticated-orcid":false,"given":"Qinglin","family":"Wang","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Laboratory of Digitizing Software for Frontier Equipment, Changsha, Hunan, China; National University of Defense Technology, National Key Laboratory of Parallel and Distributed Computing, Changsha, Hunan, China and National University of Defense Technology, College of Computer Science and Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6908-8472","authenticated-orcid":false,"given":"Lin","family":"Chen","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Laboratory of Digitizing Software for Frontier Equipment, Changsha, Hunan, China; National University of Defense Technology, National Key Laboratory of Parallel and Distributed Computing, Changsha, Hunan, China and National University of Defense Technology, College of Computer Science and Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7498-3909","authenticated-orcid":false,"given":"Tun","family":"Li","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, College of Computer Science and Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0058-2350","authenticated-orcid":false,"given":"Bo","family":"Yang","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Laboratory of Digitizing Software for Frontier Equipment, Changsha, Hunan, China; National University of Defense Technology, National Key Laboratory of Parallel and Distributed Computing, Changsha, Hunan, China and National University of Defense Technology, College of Computer Science and Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2931-4893","authenticated-orcid":false,"given":"Xinhai","family":"Chen","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Laboratory of Digitizing Software for Frontier Equipment, Changsha, Hunan, China; National University of Defense Technology, National Key Laboratory of Parallel and Distributed Computing, Changsha, Hunan, China and National University of Defense Technology, College of Computer Science and Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3745-7541","authenticated-orcid":false,"given":"Jie","family":"Liu","sequence":"additional","affiliation":[{"name":"National University Of Defense Technology, Laboratory of Digitizing Software for Frontier Equipment, Changsha, Hunan, China; National University Of Defense Technology, National Key Laboratory of Parallel and Distributed Computing, Changsha, Hunan, China and National University Of Defense Technology, College of Computer Science and Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.5555\/646665.699419"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Bjarne\u00a0Stig Andersen Jerzy Wa\u015bniewski and Fred\u00a0G. Gustavson. 2001. A recursive formulation of Cholesky factorization of a matrix in packed storage. ACM Trans. Math. Software 27 2 (2001) 214\u2013244. https:\/\/doi.org\/10.1145\/383738.383741","DOI":"10.1145\/383738.383741"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Peter Benner Pablo Ezzatti Daniel Kressner Enrique\u00a0S. Quintana-Ort\u00ed and Alfredo Rem\u00f3n. 2011. A mixed-precision algorithm for the solution of Lyapunov equations on hybrid CPU-GPU platforms. Parallel Comput. 37 8 (2011) 439\u2013450.","DOI":"10.1016\/j.parco.2010.12.002"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394277.3401846"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Erin\u00a0C. Carson. 2018. The Adaptive s-Step Conjugate Gradient Method. SIAM J. Matrix Anal. Appl. 39 3 (2018) 1318\u20131338. https:\/\/doi.org\/10.1137\/16M1077892","DOI":"10.1137\/16M1107942"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Erin\u00a0C. Carson Tom\u00e1s Gergelits and Ichitaro Yamazaki. 2022. Mixed precision s-step Lanczos and conjugate gradient algorithms. Numerical Linear Algebra with Applications 29 3 (2022). https:\/\/doi.org\/10.1002\/nla.2487","DOI":"10.1002\/nla.2425"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Ray-Bing Chen Yaohung\u00a0M. Tsai and Weichung Wang. 2014. Adaptive block size for dense QR factorization in hybrid CPU-GPU systems via statistical modeling. Parallel Comput. 40 5-6 (2014) 70\u201385.","DOI":"10.1016\/j.parco.2014.03.001"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Terry Cojean Abdou Guermouche Andra Hugo Raymond Namyst and Pierre-Andr\u00e9 Wacrenier. 2019. Resource aggregation for task-based Cholesky Factorization on top of modern architectures. Parallel Comput. 83 (2019) 73\u201392.","DOI":"10.1016\/j.parco.2018.10.007"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Terry Cojean Abdou Guermouche Andra Hugo Raymond Namyst and Pierre-Andr\u00e9 Wacrenier. 2019. Resource aggregation for task-based Cholesky Factorization on top of modern architectures. Parallel Comput. 83 (2019) 73\u201392.","DOI":"10.1016\/j.parco.2018.10.007"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"T.\u00a0A. Davis S. Rajamanickam and W.\u00a0M. Sid-Lakhdar. 2016. A survey of direct methods for sparse linear systems. Acta Numer. 25 (2016) 383\u2013566.","DOI":"10.1017\/S0962492916000076"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/2132876.2132885"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.56021\/9781421407944"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"F.\u00a0G. Gustavson and I. Jonsson. 2000. Minimal-storage high-performance Cholesky factorization via blocking and recursion. IBM Journal of Research and Development 44 6 (2000) 823\u2013850. https:\/\/doi.org\/10.1147\/rd.446.0823","DOI":"10.1147\/rd.446.0823"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Azzam Haidar Ahmad Abdelfattah Mawussi Zounon Stanimire Tomov and Jack\u00a0J. Dongarra. 2018. A Guide for Achieving High Performance with Very Small Matrices on GPU: A Case Study of Batched LU and Cholesky Factorizations. IEEE Transactions on Parallel and Distributed Systems 29 5 (2018) 973\u2013984.","DOI":"10.1109\/TPDS.2017.2783929"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511811685"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Jakub Kurzak Hartwig Anzt Mark Gates and Jack\u00a0J. Dongarra. 2016. Implementation and Tuning of Batched Cholesky Factorization and Solve for NVIDIA GPUs. IEEE Transactions on Parallel and Distributed Systems 27 7 (2016) 2036\u20132048.","DOI":"10.1109\/TPDS.2015.2481890"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Yuechen Lu Yuchen Luo Haocheng Lian Zhou Jin and Weifeng Liu. 2021. Implementing LU and Cholesky factorizations on artificial intelligence accelerators. CCF Transactions on High Performance Computing 3 3 (2021) 286\u2013297.","DOI":"10.1007\/s42514-021-00075-8"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Yuechen Lu Yuchen Luo Haocheng Lian Zhou Jin and Weifeng Liu. 2021. Implementing LU and Cholesky factorizations on artificial intelligence accelerators. CCF Transactions on High Performance Computing 3 3 (2021) 286\u2013297.","DOI":"10.1007\/s42514-021-00075-8"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Sparsh Mittal and Jeffrey\u00a0S. Vetter. 2015. A Survey of CPU-GPU Heterogeneous Computing Techniques. Comput. Surveys 47 4 (2015) 69:1\u201369:35.","DOI":"10.1145\/2788396"},{"key":"e_1_3_3_1_21_2","unstructured":"Ali Mohammadjafari and Poorya Khajouie. 2024. Optimizing Task Scheduling in Heterogeneous Computing Environments: A Comparative Analysis of CPU GPU and ASIC Platforms Using E2C Simulator. CoRR abs\/2405.08187 (2024)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3624253"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Matthias\u00a0W. Seeger. 2004. Gaussian Processes For Machine Learning. International Journal of Neural Systems 14 2 (2004) 69\u2013106. https:\/\/api.semanticscholar.org\/CorpusID:63955376","DOI":"10.1142\/S0129065704001899"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Sailes\u00a0K. Sengijpta. 1993. Fundamentals of Statistical Signal Processing: Estimation Theory. Technometrics 37 4 (1993) 465\u2013466.","DOI":"10.1080\/00401706.1995.10484391"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Lino\u00a0M. Silva and Aurelio R.\u00a0L. Oliveira. 2021. Modified controlled Cholesky factorization for preconditioning linear systems from the interior-point method. Comput. Appl. Math. 40 4 (2021).","DOI":"10.1007\/s40314-021-01544-0"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"crossref","unstructured":"Yuki Tsujita and Toshio Endo. 2015. Data Driven Scheduling Approach for the Multi-node Multi-GPU Cholesky Decomposition. Journal of Supercomputing: Special Issue on High Performance Computing for Scientific Applications (2015) 69\u201382.","DOI":"10.1007\/978-3-319-61756-5_4"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/1542275.1542312"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Xiaoqun Wang and Ian\u00a0H. Sloan. 2011. Quasi-Monte Carlo Methods in Financial Engineering: An Equivalence Principle and Dimension Reduction. Operations Research 59 1 (2011) 80\u201395.","DOI":"10.1287\/opre.1100.0853"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER.2010.12"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"crossref","unstructured":"Depeng Yang Gregory\u00a0D. Peterson and Husheng Li. 2012. Compressed sensing and Cholesky decomposition on FPGAs and GPUs. Parallel Comput. 38 8 (2012) 421\u2013437.","DOI":"10.1016\/j.parco.2012.03.001"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE51399.2021.00169"},{"key":"e_1_3_3_1_32_2","volume-title":"The Finite Element Method for Solid and Structural Mechanics (6th ed.)","author":"Zienkiewicz O.\u00a0C.","year":"2005","unstructured":"O.\u00a0C. Zienkiewicz and R.\u00a0L. Taylor. 2005. The Finite Element Method for Solid and Structural Mechanics (6th ed.)."}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3725756","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:59Z","timestamp":1755867479000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3725756"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":31,"alternative-id":["10.1145\/3721145.3725756","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3725756","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}