{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:13Z","timestamp":1755870013782,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3730411","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"412-425","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["An Efficient 2D Fusion Method for High-Performance Two-Stage Eigensolvers on Modern Heterogeneous Architectures"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0064-4327","authenticated-orcid":false,"given":"Yongxiao","family":"Zhou","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7179-6593","authenticated-orcid":false,"given":"Yi","family":"Zong","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2358-3395","authenticated-orcid":false,"given":"Yuyang","family":"Jin","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6824-5535","authenticated-orcid":false,"given":"Heng","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9740-6581","authenticated-orcid":false,"given":"Wei","family":"Xue","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China; Qinghai University, Xining, China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Advanced Micro Devices Inc. 2024. AMD matrix cores. https:\/\/rocm.blogs.amd.com\/software-tools-optimization\/matrix-cores\/README.html Accessed: 2024-11-25."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.5555\/323215"},{"key":"e_1_3_3_1_4_2","unstructured":"Edward Anderson Jack\u00a0J. Dongarra and Susan Ostrouchov. 1992. LAPACK Working Note 41: Installation Guide for LAPACK. https:\/\/api.semanticscholar.org\/CorpusID:16481322"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"T. Auckenthaler V. Blum H.-J. Bungartz T. Huckle R. Johanni L. Kr\u00e4mer B. Lang H. Lederer and P.R. Willems. 2011. Parallel Solution of Partial Symmetric Eigenvalue Problems from Electronic Structure Calculations. 37 12 (2011) 783\u2013794. https:\/\/doi.org\/10.1016\/j.parco.2011.05.002","DOI":"10.1016\/j.parco.2011.05.002"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"M Be\u010dka G Ok\u0161a and M Vajter\u0161ic. 2002. Dynamic Ordering for a Parallel Block-Jacobi SVD Algorithm. Parallel Comput. 28 2 (Feb. 2002) 243\u2013262. https:\/\/doi.org\/10.1016\/S0167-8191(01)00138-7","DOI":"10.1016\/S0167-8191(01)00138-7"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Martin Be\u010dka Gabriel Ok\u0161a and Mari\u00e1n Vajter\u0161ic. 2015. New dynamic orderings for the parallel one\u2013sided block-Jacobi SVD algorithm. Parallel Processing Letters 25 02 (2015) 1550003.","DOI":"10.1142\/S0129626415500036"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"MARTIN BE\u010cKA and Mari\u00e1n Vajter\u0161ic. 1999. Block-Jacobi SVD algorithms for distributed memory systems I: Hypercubes and rings. Parallel Algorithms and Application 13 3 (1999) 265\u2013287.","DOI":"10.1080\/10637199808947370"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"David Bindel James Demmel William Kahan and Osni Marques. 2002. On computing Givens rotations reliably and efficiently. ACM Transactions on Mathematical Software (TOMS) 28 2 (2002) 206\u2013238.","DOI":"10.1145\/567806.567809"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/SHPCC.1994.296622"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Christian\u00a0H. Bischof Bruno Lang and Xiaobai Sun. 2000. Algorithm 807: The SBR Toolbox\u2014Software for Successive Band Reduction. 26 4 (2000) 602\u2013616. https:\/\/doi.org\/10.1145\/365723.365736","DOI":"10.1145\/365723.365736"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Christian\u00a0H. Bischof Bruno Lang and Xiaobai Sun. 2000. Algorithm 807: The SBR Toolbox\u2014Software for Successive Band Reduction. ACM Trans. Math. Software 26 4 (Dec. 2000) 602\u2013616. https:\/\/doi.org\/10.1145\/365723.365736","DOI":"10.1145\/365723.365736"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"L\u00a0Susan Blackford Antoine Petitet Roldan Pozo Karin Remington R\u00a0Clint Whaley James Demmel Jack Dongarra Iain Duff Sven Hammarling Greg Henry et\u00a0al. 2002. An updated set of basic linear algebra subprograms (BLAS). ACM Trans. Math. Software 28 2 (2002) 135\u2013151.","DOI":"10.1145\/567806.567807"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Thomas\u00a0E Booth. 2006. Power iteration method for the several largest eigenvalues and eigenfunctions. Nuclear science and engineering 154 1 (2006) 48\u201362.","DOI":"10.13182\/NSE05-05"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"James Demmel and Kre\u0161imir Veseli\u0107. 1992. Jacobi\u2019s method is more accurate than QR. SIAM journal on matrix analysis and applications 13 4 (1992) 1204\u20131245.","DOI":"10.1137\/0613074"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Inderjit\u00a0S Dhillon and Beresford\u00a0N Parlett. 2004. Multiple representations to compute orthogonal eigenvectors of symmetric tridiagonal matrices. Linear Algebra Appl. 387 (2004) 1\u201328.","DOI":"10.1016\/j.laa.2003.12.028"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Jack Dongarra Mark Gates Azzam Haidar Jakub Kurzak Piotr Luszczek Stanimire Tomov and Ichitaro Yamazaki. 2018. The singular value decomposition: Anatomy of optimizing an algorithm for extreme scale. SIAM review 60 4 (2018) 808\u2013865.","DOI":"10.1137\/17M1117732"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Jack\u00a0J Dongarra Danny\u00a0C Sorensen and Sven\u00a0J Hammarling. 1989. Block reduction of matrices to condensed forms for eigenvalue computations. J. Comput. Appl. Math. 27 1-2 (1989) 215\u2013227.","DOI":"10.1016\/0377-0427(89)90367-1"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Augustin\u00a0A Dubrulle. 2000. Householder transformations revisited. SIAM J. Matrix Anal. Appl. 22 1 (2000) 33\u201340.","DOI":"10.1137\/S0895479898338561"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Patricia\u00a0J. Eberlein and Haesun Park. 1990. Efficient implementation of Jacobi algorithms and Jacobi sets on distributed memory architectures. J. Parallel and Distrib. Comput. 8 4 (1990) 358\u2013366.","DOI":"10.1016\/0743-7315(90)90134-B"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2015.128"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Jiangang Gao Fang Zheng Fengbin Qi Yajun Ding Hongliang Li Hongsheng Lu Wangquan He Hongmei Wei Lifeng Jin Xin Liu et\u00a0al. 2021. Sunway supercomputer architecture towards exascale computing: analysis and practice. Science China Information Sciences 64 4 (2021) 141101.","DOI":"10.1007\/s11432-020-3104-7"},{"key":"e_1_3_3_1_23_2","unstructured":"Weiguo Gao Yuxin Ma and Meiyue Shao. 2022. A mixed precision Jacobi SVD algorithm. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2209.04626 (2022)."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Ming Gu and Stanley\u00a0C Eisenstat. 1995. A divide-and-conquer algorithm for the bidiagonal SVD. SIAM J. Matrix Anal. Appl. 16 1 (1995) 79\u201392.","DOI":"10.1137\/S0895479892242232"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/2503210.2503292"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/2503210.2503292"},{"key":"e_1_3_3_1_27_2","unstructured":"A Haidar S Tomov I Yamazaki R Solca T Schulthess T Dong and J Dongarra. 2008. Magma: A breakthrough in solvers for eigenvalue problems."},{"key":"e_1_3_3_1_28_2","unstructured":"Intel Corporation. 2024. Intel MKL Library. http:\/\/software.intel.com\/en-us\/articles\/intel-mkl\/ Accessed: 2024-11-25."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607031"},{"key":"e_1_3_3_1_30_2","volume-title":"Krylov subspace methods: principles and analysis","author":"Liesen J\u00f6rg","year":"2013","unstructured":"J\u00f6rg Liesen and Zdenek Strakos. 2013. Krylov subspace methods: principles and analysis. Numerical Mathematics and Scie."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Fangfang Liu Wenjing Ma Yuwen Zhao Daokun Chen Yi Hu Qinglin Lu WanWang Yin Xinhui Yuan Lijuan Jiang Hao Yan et\u00a0al. 2023. xmath2. 0: a high-performance extended math library for sw26010-pro many-core processor. CCF Transactions on High Performance Computing 5 1 (2023) 56\u201371.","DOI":"10.1007\/s42514-022-00126-8"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2011.91"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Andreas Marek Volker Blum Rainer Johanni Ville Havu Bruno Lang Thomas Auckenthaler Alexander Heinecke Hans-Joachim Bungartz and Hermann Lederer. 2014. The ELPA library: scalable parallel eigenvalue solutions for electronic structure theory and computational science. Journal of Physics: Condensed Matter 26 21 (2014) 213201.","DOI":"10.1088\/0953-8984\/26\/21\/213201"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-61982-8_8"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Per-Gunnar Martinsson Gregorio Quintana\u00a0Ort\u00cd Nathan Heavner and Robert Van De\u00a0Geijn. 2017. Householder QR factorization with randomization for column pivoting (HQRRP). SIAM Journal on Scientific Computing 39 2 (2017) C96\u2013C115.","DOI":"10.1137\/16M1081270"},{"key":"e_1_3_3_1_36_2","unstructured":"NVIDIA Corporation. 2024. cuSOLVER. https:\/\/docs.nvidia.com\/cuda\/cusolver\/index.html\/ Accessed: 2024-11-25."},{"key":"e_1_3_3_1_37_2","unstructured":"NVIDIA Corporation. 2024. NVIDIA Tensor Cores. https:\/\/www.nvidia.com\/en-us\/data-center\/tensor-cores\/ Accessed: 2024-11-25."},{"key":"e_1_3_3_1_38_2","unstructured":"NVIDIA Corporation. 2024. TensorFloat-32 in the A100 GPU Accelerates AI Training HPC up to 20x. https:\/\/blogs.nvidia.com\/blog\/tensorfloat-32-precision-format\/ Accessed: 2024-11-25."},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"crossref","unstructured":"Beresford\u00a0N Parlett and David\u00a0S Scott. 1979. The Lanczos algorithm with selective orthogonalization. Mathematics of computation 33 145 (1979) 217\u2013238.","DOI":"10.1090\/S0025-5718-1979-0514820-3"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"David Patterson Thomas Anderson Neal Cardwell Richard Fromm Kimberly Keeton Christoforos Kozyrakis Randi Thomas and Katherine Yelick. 1997. A case for intelligent RAM. IEEE micro 17 2 (1997) 34\u201344.","DOI":"10.1109\/40.592312"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"crossref","unstructured":"Melven R\u00f6hrig-Z\u00f6llner Jonas Thies Moritz Kreutzer Andreas Alvermann Andreas Pieper Achim Basermann Georg Hager Gerhard Wellein and Holger Fehske. 2015. Increasing the performance of the Jacobi\u2013Davidson method by blocking. SIAM Journal on Scientific Computing 37 6 (2015) C697\u2013C722.","DOI":"10.1137\/140976017"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"Robert Schreiber and Charles Van\u00a0Loan. 1989. A storage-efficient WY representation for products of Householder transformations. SIAM J. Sci. Statist. Comput. 10 1 (1989) 53\u201357.","DOI":"10.1137\/0910005"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"crossref","unstructured":"Stanimire Tomov Jack Dongarra and Marc Baboulin. 2010. Towards dense linear algebra for hybrid GPU accelerated manycore systems. Parallel Comput. 36 5-6 (June 2010) 232\u2013240. https:\/\/doi.org\/10.1016\/j.parco.2009.12.005","DOI":"10.1016\/j.parco.2009.12.005"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/ScalAH56622.2022.00011"},{"key":"e_1_3_3_1_45_2","unstructured":"University of Tennessee (US). 2024. Matrix Algebra on GPU and Multi-core Architectures. https:\/\/icl.utk.edu\/magma\/ Accessed: 2024-11-25."},{"key":"e_1_3_3_1_46_2","unstructured":"University of Tennessee (US) University of Manchester (UK). 2024. Parallel Linear Algebra Software for Multicore Architectures. https:\/\/github.com\/icl-utk-edu\/plasma\/ Accessed: 2024-11-25."},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.5555\/1413370.1413437"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"crossref","unstructured":"Samuel Williams Andrew Waterman and David Patterson. 2009. Roofline: an insightful visual performance model for multicore architectures. Commun. ACM 52 4 (2009) 65\u201376.","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_3_1_49_2","volume-title":"QUARK Users\u2019 Guide: Queuing And Runtime for Kernels, Version 1.0","author":"YarKhan Asim","year":"2011","unstructured":"Asim YarKhan, Jakub Kurzak, and Jack Dongarra. 2011. QUARK Users\u2019 Guide: Queuing And Runtime for Kernels, Version 1.0. technical report UT-ICL-11-02. University of Tennessee Innovative Computing Laboratory, Knoxville, Tennessee 37996."},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"crossref","unstructured":"Victor Wen-zhe Yu Jonathan Moussa Pavel K\u016fs Andreas Marek Peter Messmer Mina Yoon Hermann Lederer and Volker Blum. 2021. GPU-Acceleration of the ELPA2 Distributed Eigensolver for Dense Symmetric and Hermitian Eigenproblems. 262 (2021) 107808. https:\/\/doi.org\/10.1016\/j.cpc.2020.107808 arXiv:https:\/\/arXiv.org\/abs\/2002.10991\u00a0[physics]","DOI":"10.1016\/j.cpc.2020.107808"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3730411","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:04:11Z","timestamp":1755867851000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3730411"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":49,"alternative-id":["10.1145\/3721145.3730411","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3730411","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}