{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,19]],"date-time":"2025-04-19T04:06:07Z","timestamp":1745035567487,"version":"3.40.4"},"reference-count":50,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,3,23]],"date-time":"2025-03-23T00:00:00Z","timestamp":1742688000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,3,23]],"date-time":"2025-03-23T00:00:00Z","timestamp":1742688000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["22333003"],"award-info":[{"award-number":["22333003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. HPC"],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1007\/s42514-024-00202-1","type":"journal-article","created":{"date-parts":[[2025,3,23]],"date-time":"2025-03-23T12:35:38Z","timestamp":1742733338000},"page":"129-141","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Mixed precision SpMV on GPUs for irregular data with hierarchical precision selection"],"prefix":"10.1007","volume":"7","author":[{"given":"Jianfei","family":"Xu","sequence":"first","affiliation":[]},{"given":"Lianhua","family":"He","sequence":"additional","affiliation":[]},{"given":"Zhong","family":"Jin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,23]]},"reference":[{"issue":"9","key":"202_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3477141","volume":"54","author":"S Abadal","year":"2021","unstructured":"Abadal, S., Jain, A., Guirado, R., L\u00f3pez-Alonso, J., Alarc\u00f3n, E.: Computing graph neural networks: A survey from algorithms to accelerators. ACM Computing Surveys (CSUR) 54(9), 1\u201338 (2021)","journal-title":"ACM Computing Surveys (CSUR)"},{"issue":"4","key":"202_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3371275","volume":"16","author":"K Ahmad","year":"2019","unstructured":"Ahmad, K., Sundar, H., Hall, M.: Data-driven mixed precision sparse matrix vector multiplication for GPUs. ACM Trans. Arch. Code Optim. (TACO) 16(4), 1\u201324 (2019)","journal-title":"ACM Trans. Arch. Code Optim. (TACO)"},{"issue":"4","key":"202_CR3","doi-asserted-by":"publisher","first-page":"2198","DOI":"10.1093\/imanum\/drac037","volume":"43","author":"P Amestoy","year":"2023","unstructured":"Amestoy, P., Boiteau, O., Buttari, A., Gerest, M., J\u00e9z\u00e9quel, F., L\u2019Excellent, J.-Y., Mary, T.: Mixed precision low-rank approximations and their application to block low-rank LU factorization. IMA J. Numer. Anal. 43(4), 2198\u20132227 (2023)","journal-title":"IMA J. Numer. Anal."},{"issue":"6","key":"202_CR4","doi-asserted-by":"publisher","first-page":"4460","DOI":"10.1002\/cpe.4460","volume":"31","author":"H Anzt","year":"2019","unstructured":"Anzt, H., Dongarra, J., Flegar, G., Higham, N.J., Quintana-Ort\u00ed, E.S.: Adaptive precision in block-Jacobi preconditioning for iterative sparse linear system solvers. Concurr. Comput. Pract. Exp. 31(6), 4460 (2019)","journal-title":"Concurr. Comput. Pract. Exp."},{"key":"202_CR5","doi-asserted-by":"crossref","unstructured":"Ashari, A., Sedaghati, N., Eisenlohr, J., Parthasarath, S., Sadayappan, P.: Fast sparse matrix-vector multiplication on GPUs for graph applications. In: SC\u201914: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 781\u2013792 (2014). IEEE","DOI":"10.1109\/SC.2014.69"},{"key":"202_CR6","unstructured":"Bhaskaran, R., Collins, L.: Introduction to CFD basics. Cornell University-Sibley School of Mechanical and Aerospace Engineering, 1\u201321 (2002)"},{"issue":"1\u20137","key":"202_CR7","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1016\/S0169-7552(98)00110-X","volume":"30","author":"S Brin","year":"1998","unstructured":"Brin, S., Page, L.: The anatomy of a large-scale hypertextual web search engine. Comput. Netw. ISDN Syst. 30(1\u20137), 107\u2013117 (1998)","journal-title":"Comput. Netw. ISDN Syst."},{"issue":"2","key":"202_CR8","doi-asserted-by":"publisher","first-page":"817","DOI":"10.1137\/17M1140819","volume":"40","author":"E Carson","year":"2018","unstructured":"Carson, E., Higham, N.J.: Accelerating the solution of linear systems by iterative refinement in three precisions. SIAM J. Sci. Comput. 40(2), 817\u2013847 (2018)","journal-title":"SIAM J. Sci. Comput."},{"key":"202_CR9","doi-asserted-by":"crossref","unstructured":"Choquette, J., Gandhi, W.: Nvidia a100 gpu: Performance & innovation for gpu computing. In: 2020 IEEE Hot Chips 32 Symposium (HCS), pp. 1\u201343 (2020). IEEE Computer Society","DOI":"10.1109\/HCS49909.2020.9220622"},{"key":"202_CR10","doi-asserted-by":"crossref","unstructured":"Daga, M., Greathouse, J.L.: Structural agnostic SpMV: Adapting CSR-adaptive for irregular matrices. In: 2015 IEEE 22nd International Conference on High Performance Computing (HiPC), pp. 64\u201374 (2015). IEEE","DOI":"10.1109\/HiPC.2015.55"},{"issue":"1","key":"202_CR11","first-page":"1","volume":"38","author":"TA Davis","year":"2011","unstructured":"Davis, T.A., Hu, Y.: The University of Florida sparse matrix collection. ACM Trans. Math. Softw. (TOMS) 38(1), 1\u201325 (2011)","journal-title":"ACM Trans. Math. Softw. (TOMS)"},{"key":"202_CR12","doi-asserted-by":"publisher","first-page":"383","DOI":"10.1017\/S0962492916000076","volume":"25","author":"TA Davis","year":"2016","unstructured":"Davis, T.A., Rajamanickam, S., Sid-Lakhdar, W.M.: A survey of direct methods for sparse linear systems. Acta Numer 25, 383\u2013566 (2016)","journal-title":"Acta Numer"},{"key":"202_CR13","unstructured":"Dettmers, T., Lewis, M., Belkada, Y., Zettlemoyer, L.: Gpt3. int8 (): 8-bit matrix multiplication for transformers at scale. Advances in Neural Information Processing Systems 35, 30318\u201330332 (2022)"},{"key":"202_CR14","unstructured":"Diffenderfer, J., Osei-Kuffuor, D., Menon, H.: QDOT: Quantized dot product kernel for approximate high-performance computing. arXiv preprint arXiv:2105.00115 (2021)"},{"key":"202_CR15","doi-asserted-by":"publisher","first-page":"201","DOI":"10.1007\/s101070100263","volume":"91","author":"ED Dolan","year":"2002","unstructured":"Dolan, E.D., Mor\u00e9, J.J.: Benchmarking optimization software with performance profiles. Math. Program. 91, 201\u2013213 (2002)","journal-title":"Math. Program."},{"issue":"2","key":"202_CR16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3441850","volume":"47","author":"G Flegar","year":"2021","unstructured":"Flegar, G., Anzt, H., Cojean, T., Quintana-Orti, E.S.: Adaptive precision block-Jacobi for high performance preconditioning in the Ginkgo linear algebra software. ACM Transactions on Mathematical Software (TOMS) 47(2), 1\u201328 (2021)","journal-title":"ACM Transactions on Mathematical Software (TOMS)"},{"key":"202_CR17","unstructured":"Gao, J., Liu, B., Ji, W., Huang, H.: A Systematic Literature Survey of Sparse Matrix-Vector Multiplication. arXiv preprint arXiv:2404.06047 (2024)"},{"issue":"1","key":"202_CR18","doi-asserted-by":"publisher","first-page":"30","DOI":"10.1137\/22M1522619","volume":"46","author":"S Graillat","year":"2024","unstructured":"Graillat, S., J\u00e9z\u00e9quel, F., Mary, T., Molina, R.: Adaptive precision sparse matrix-vector product and its application to krylov solvers. SIAM J. Sci. Comput. 46(1), 30\u201356 (2024)","journal-title":"SIAM J. Sci. Comput."},{"key":"202_CR19","unstructured":"Gratton, S., Simon, E., Titley-Peloquin, D., Toint, P.: Exploiting variable precision in GMRES. arXiv preprint arXiv:1907.10550 (2019)"},{"key":"202_CR20","doi-asserted-by":"crossref","unstructured":"Greathouse, J.L., Daga, M.: Efficient sparse matrix-vector multiplication on GPUs using the CSR storage format. In: SC\u201914: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 769\u2013780 (2014). IEEE","DOI":"10.1109\/SC.2014.68"},{"key":"202_CR21","doi-asserted-by":"crossref","unstructured":"Grigora\u015f, P., Burovskiy, P., Luk, W., Sherwin, S.: Optimising Sparse Matrix Vector multiplication for large scale FEM problems on FPGA. In: 2016 26th International Conference on Field Programmable Logic and Applications (FPL), pp. 1\u20139 (2016). IEEE","DOI":"10.1109\/FPL.2016.7577352"},{"key":"202_CR22","unstructured":"Grossman, M., Thiele, C., Araya-Polo, M., Frank, F., Alpak, F.O., Sarkar, V.: A survey of sparse matrix-vector multiplication performance on large matrices. arXiv preprint arXiv:1608.00636 (2016)"},{"issue":"15","key":"202_CR23","doi-asserted-by":"publisher","first-page":"5418","DOI":"10.1002\/cpe.5418","volume":"32","author":"T Gr\u00fctzmacher","year":"2020","unstructured":"Gr\u00fctzmacher, T., Cojean, T., Flegar, G., G\u00f6bel, F., Anzt, H.: A customized precision format based on mantissa segmentation for accelerating sparse linear algebra. Concurrency and Computation: Practice and Experience 32(15), 5418 (2020)","journal-title":"Concurrency and Computation: Practice and Experience"},{"key":"202_CR24","doi-asserted-by":"publisher","DOI":"10.1016\/j.jocs.2022.101609","volume":"61","author":"K Isupov","year":"2022","unstructured":"Isupov, K.: Multiple-precision sparse matrix-vector multiplication on GPUs. Journal of computational science 61, 101609 (2022)","journal-title":"Journal of computational science"},{"key":"202_CR25","doi-asserted-by":"crossref","unstructured":"Jacob, B., Kligys, S., Chen, B., Zhu, M., Tang, M., Howard, A., Adam, H., Kalenichenko, D.: Quantization and training of neural networks for efficient integer-arithmetic-only inference. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2704\u20132713 (2018)","DOI":"10.1109\/CVPR.2018.00286"},{"issue":"94720\u20131776","key":"202_CR26","first-page":"11","volume":"754","author":"W Kahan","year":"1996","unstructured":"Kahan, W.: IEEE standard 754 for binary floating-point arithmetic. Lecture Notes on the Status of IEEE 754(94720\u20131776), 11 (1996)","journal-title":"Lecture Notes on the Status of IEEE"},{"key":"202_CR27","unstructured":"Kouya, T.: A highly efficient implementation of multiple precision sparse matrix-vector multiplication and its application to product-type Krylov subspace methods. arXiv preprint arXiv:1411.2377 (2014)"},{"key":"202_CR28","unstructured":"Kumar, U., Chauhan, V., Choudhary, S.K., Jain, S., Jagga, S.: Single Precision Floating Point Unit (FPU) Based on IEEE 754 Standard Using Verilog. In: IPEC, vol. 1, pp. 11\u201316 (2022)"},{"key":"202_CR29","unstructured":"Kurbanaliev, A., Maksutov, A., Obodoeva, G., Oichueva, B.: Using OpenFOAM multiphase solver interFoam for large scale modeling. In: Proceeding of The World Congress on Engineering and Computer Science, San Francisco, USA, pp. 22\u201324 (2019)"},{"issue":"10","key":"202_CR30","doi-asserted-by":"publisher","first-page":"1371","DOI":"10.1002\/cpe.1164","volume":"19","author":"J Kurzak","year":"2007","unstructured":"Kurzak, J., Dongarra, J.: Implementation of mixed precision in solving systems of linear equations on the CELL processor. Concurrency and Computation: Practice and Experience 19(10), 1371\u20131385 (2007)","journal-title":"Concurrency and Computation: Practice and Experience"},{"key":"202_CR31","doi-asserted-by":"crossref","unstructured":"Leinhauser, M., Young, J., Bastrakov, S., Widera, R., Chatterjee, R., Chandrasekaran, S.: Performance analysis of PIConGPU: particle-in-cell on GPUs using NVIDIA\u2019s NSight systems and NSight compute. Technical report, Oak Ridge National Lab.(ORNL), Oak Ridge, TN (United States) (2021)","DOI":"10.2172\/1761619"},{"key":"202_CR32","doi-asserted-by":"publisher","first-page":"433","DOI":"10.1007\/s11227-016-1887-4","volume":"73","author":"S Lin","year":"2017","unstructured":"Lin, S., Xie, Z.: A Jacobi_PCG solver for sparse linear systems on multi-GPU cluster. J. Supercomput. 73, 433\u2013454 (2017)","journal-title":"J. Supercomput."},{"key":"202_CR33","doi-asserted-by":"crossref","unstructured":"Lindquist, N., Luszczek, P., Dongarra, J.: Improving the performance of the GMRES method using mixed-precision techniques. In: Driving Scientific and Engineering Discoveries Through the Convergence of HPC, Big Data and AI: 17th Smoky Mountains Computational Sciences and Engineering Conference, SMC 2020, Oak Ridge, TN, USA, August 26-28, 2020, Revised Selected Papers 17, pp. 51\u201366 (2020). Springer","DOI":"10.1007\/978-3-030-63393-6_4"},{"key":"202_CR34","doi-asserted-by":"crossref","unstructured":"Liu, W., Vinter, B.: CSR5: An efficient storage format for cross-platform sparse matrix-vector multiplication. In: Proceedings of the 29th ACM on International Conference on Supercomputing, pp. 339\u2013350 (2015)","DOI":"10.1145\/2751205.2751209"},{"key":"202_CR35","doi-asserted-by":"crossref","unstructured":"Loe, J.A., Glusa, C.A., Yamazaki, I., Boman, E.G., Rajamanickam, S.: A study of mixed precision strategies for GMRES on GPUs. arXiv preprint arXiv:2109.01232 (2021)","DOI":"10.1109\/IPDPSW52791.2021.00078"},{"key":"202_CR36","doi-asserted-by":"crossref","unstructured":"Lu, Y., Liu, W.: DASP: Specific Dense Matrix Multiply-Accumulate Units Accelerated General Sparse Matrix-Vector Multiplication. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201314 (2023)","DOI":"10.1145\/3581784.3607051"},{"key":"202_CR37","doi-asserted-by":"crossref","unstructured":"Mukunoki, D., Takahashi, D.: Optimization of sparse matrix-vector multiplication for CRS format on NVIDIA Kepler architecture GPUs. In: Computational Science and Its Applications\u2013ICCSA 2013: 13th International Conference, Ho Chi Minh City, Vietnam, June 24-27, 2013, Proceedings, Part V 13, pp. 211\u2013223 (2013). Springer","DOI":"10.1007\/978-3-642-39640-3_15"},{"key":"202_CR38","doi-asserted-by":"publisher","DOI":"10.1016\/j.cam.2019.112701","volume":"372","author":"D Mukunoki","year":"2020","unstructured":"Mukunoki, D., Ogita, T.: Performance and energy consumption of accurate and mixed-precision linear algebra kernels on GPUs. J. Comput. Appl. Math. 372, 112701 (2020)","journal-title":"J. Comput. Appl. Math."},{"key":"202_CR39","doi-asserted-by":"crossref","unstructured":"Ooi, R., Iwashita, T., Fukaya, T., Ida, A., Yokota, R.: Effect of mixed precision computing on H-matrix vector multiplication in BEM analysis. In: Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region, pp. 92\u2013101 (2020)","DOI":"10.1145\/3368474.3368479"},{"key":"202_CR40","doi-asserted-by":"crossref","unstructured":"Plemmons, R.J.: M-matrix characterizations. I\u2014nonsingular M-matrices. Linear Algebra and its applications 18(2), 175\u2013188 (1977)","DOI":"10.1016\/0024-3795(77)90073-8"},{"key":"202_CR41","doi-asserted-by":"crossref","unstructured":"Raihan, M.A., Goli, N., Aamodt, T.M.: Modeling deep learning accelerator enabled gpus. In: 2019 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), pp. 79\u201392 (2019). IEEE","DOI":"10.1109\/ISPASS.2019.00016"},{"issue":"3","key":"202_CR42","doi-asserted-by":"publisher","first-page":"856","DOI":"10.1137\/0907058","volume":"7","author":"Y Saad","year":"1986","unstructured":"Saad, Y., Schultz, M.H.: GMRES: A generalized minimal residual algorithm for solving nonsymmetric linear systems. SIAM J. Sci. Stat. Comput. 7(3), 856\u2013869 (1986)","journal-title":"SIAM J. Sci. Stat. Comput."},{"key":"202_CR43","unstructured":"St\u00fcben, K.: Algebraic multigrid (AMG). An introduction with applications (1999)"},{"key":"202_CR44","doi-asserted-by":"crossref","unstructured":"Tezcan, E., Torun, T., Ko\u015far, F., Kaya, K., Unat, D.: Mixed and multi-precision SpMV for GPUs with row-wise precision selection. In: 2022 IEEE 34th International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD), pp. 31\u201340 (2022). IEEE","DOI":"10.1109\/SBAC-PAD55451.2022.00014"},{"key":"202_CR45","doi-asserted-by":"crossref","unstructured":"Vinsome, P.K.: Orthomin, an iterative method for solving sparse sets of simultaneous linear equations. In: SPE Symposium on Numerical Simulation of Reservoir Performance, p. 5729 (1976). SPE","DOI":"10.2118\/5729-MS"},{"key":"202_CR46","doi-asserted-by":"crossref","unstructured":"Wang, Y., Chang, F., Wei, B., Gao, J., Ji, W.: Optimization of Sparse Matrix Computation for Algebraic Multigrid on GPUs. ACM Transactions on Architecture and Code Optimization (2024)","DOI":"10.1145\/3664924"},{"key":"202_CR47","doi-asserted-by":"crossref","unstructured":"Yoshizawa, H., Takahashi, D.: Automatic tuning of sparse matrix-vector multiplication for CRS format on GPUs. In: 2012 IEEE 15th International Conference on Computational Science and Engineering, pp. 130\u2013136 (2012). IEEE","DOI":"10.1109\/ICCSE.2012.28"},{"key":"202_CR48","unstructured":"Zhao, R., Hu, Y., Dotzel, J., De\u00a0Sa, C., Zhang, Z.: Improving neural network quantization without retraining using outlier channel splitting. In: International Conference on Machine Learning, pp. 7543\u20137552 (2019). PMLR"},{"key":"202_CR49","doi-asserted-by":"crossref","unstructured":"Zhao, Z., Zhang, G., Wu, Y., Hong, R., Yang, Y., Fu, Y.: Block-wise dynamic mixed-precision for sparse matrix-vector multiplication on GPUs. The Journal of Supercomputing, 1\u201333 (2024)","DOI":"10.1007\/s11227-024-05949-6"},{"key":"202_CR50","doi-asserted-by":"publisher","first-page":"525","DOI":"10.2197\/ipsjjip.30.525","volume":"30","author":"Y Zhao","year":"2022","unstructured":"Zhao, Y., Fukaya, T., Zhang, L., Iwashita, T.: Numerical Investigation into the Mixed Precision GMRES (m) Method Using FP64 and FP32. Journal of Information Processing 30, 525\u2013537 (2022)","journal-title":"Journal of Information Processing"}],"container-title":["CCF Transactions on High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-024-00202-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42514-024-00202-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-024-00202-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,18]],"date-time":"2025-04-18T09:54:56Z","timestamp":1744970096000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42514-024-00202-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,23]]},"references-count":50,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,4]]}},"alternative-id":["202"],"URL":"https:\/\/doi.org\/10.1007\/s42514-024-00202-1","relation":{},"ISSN":["2524-4922","2524-4930"],"issn-type":[{"type":"print","value":"2524-4922"},{"type":"electronic","value":"2524-4930"}],"subject":[],"published":{"date-parts":[[2025,3,23]]},"assertion":[{"value":"8 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 October 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 March 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"On behalf of all authors, the corresponding author states that there is no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}