{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T01:37:50Z","timestamp":1770687470462,"version":"3.49.0"},"reference-count":25,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T00:00:00Z","timestamp":1763424000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T00:00:00Z","timestamp":1763424000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"the National Key Research and Development Program of China","award":["2024YFB4504103"],"award-info":[{"award-number":["2024YFB4504103"]}]},{"name":"the Major Science and Technology Special Projects in Henan Province","award":["24111121230"],"award-info":[{"award-number":["24111121230"]}]},{"name":"the National Key Research and Development Program of China","award":["2023ZD0120604"],"award-info":[{"award-number":["2023ZD0120604"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. HPC"],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1007\/s42514-025-00254-x","type":"journal-article","created":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T15:00:05Z","timestamp":1763478005000},"page":"49-60","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Optimizing sparse-dense matrix\u2013matrix multiplication for DCUs"],"prefix":"10.1007","volume":"8","author":[{"given":"Hengliang","family":"Guo","sequence":"first","affiliation":[]},{"given":"Yubo","family":"Han","sequence":"additional","affiliation":[]},{"given":"Haolei","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Shengguang","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Gang","family":"Wu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3517-2005","authenticated-orcid":false,"given":"Yang","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Xiangdong","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Chuanqiang","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,18]]},"reference":[{"key":"254_CR1","unstructured":"AMD: HIP Documentation. https:\/\/rocm.docs.amd.com\/projects\/HIP\/en\/latest\/index.html (2022). Accessed 22 Dec 2024"},{"key":"254_CR2","unstructured":"AMD: ROCm Documentation. https:\/\/rocm.docs.amd.com\/en\/docs-6.0.0\/index.html (2024). Accessed 21 Dec 2024"},{"key":"254_CR3","unstructured":"AMD: ROCSPARSE Documentation. https:\/\/rocsparse.readthedocs.io\/en\/master\/ (2023). Accessed 22 Dec 2023"},{"key":"254_CR4","unstructured":"AMD: Vega 7nm Shader ISA. https:\/\/gpuopen.com\/wp-content\/uploads\/2019\/11\/Vega_7nm_Shader_ISA_26November2019.pdf (2019). Accessed 22 Dec 2024"},{"issue":"1","key":"254_CR5","first-page":"1","volume":"38","author":"TA Davis","year":"2011","unstructured":"Davis, T.A., Hu, Y.: The university of florida sparse matrix collection. ACM Trans. Math. Softw. (TOMS) 38(1), 1\u201325 (2011)","journal-title":"ACM Trans. Math. Softw. (TOMS)"},{"key":"254_CR6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s42514-024-00205-y","volume":"7","author":"W Fan","year":"2025","unstructured":"Fan, W., Hua, H., Shang, J., Wen, Z., Guo, H., Zhang, L.: Optimizing 2d convolution for DCUs. CCF Trans. High Perform. Comput. 7, 1\u201313 (2025)","journal-title":"CCF Trans. High Perform. Comput."},{"key":"254_CR7","doi-asserted-by":"crossref","unstructured":"Fan, R., Wang, W., Chu, X.: Dtc-spmm: Bridging the gap in accelerating general sparse matrix multiplication with tensor cores. In: Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3, pp. 253\u2013267 (2024)","DOI":"10.1145\/3620666.3651378"},{"key":"254_CR8","doi-asserted-by":"crossref","unstructured":"Fan, R., Wang, W., Chu, X.: Fast sparse gpu kernels for accelerated training of graph neural networks. In: 2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 501\u2013511. IEEE. (2023)","DOI":"10.1109\/IPDPS54959.2023.00057"},{"key":"254_CR9","doi-asserted-by":"crossref","unstructured":"Gale, T., Zaharia, M., Young, C., Elsen, E.: Sparse gpu kernels for deep learning. In: SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201314. IEEE (2020)","DOI":"10.1109\/SC41405.2020.00021"},{"key":"254_CR10","doi-asserted-by":"crossref","unstructured":"Guo, M., Wang, Y., Gu, Y., Chen, Y., Liu, H., Chen, H., Han, D., Xu, H., Deng, C., Tang, P., et al.: Bs-spmm: Accelerate sparse matrix-matrix multiplication by balanced split strategy on the gpu. In: IEEE INFOCOM 2023-IEEE Conference on Computer Communications Workshops (INFOCOM WKSHPS), pp. 1\u20136. IEEE. (2023)","DOI":"10.1109\/INFOCOMWKSHPS57453.2023.10226061"},{"key":"254_CR11","doi-asserted-by":"crossref","unstructured":"Guo, M., Wang, Y., Huang, J., Wang, Q., Zhang, Y., Xu, M., Lu, F.: Rgs-spmm: Accelerate sparse matrix-matrix multiplication by row group splitting strategy on the gpu. In: IFIP International Conference on Network and Parallel Computing, pp. 61\u201366. Springer. (2022)","DOI":"10.1007\/978-3-031-21395-3_6"},{"issue":"10","key":"254_CR12","doi-asserted-by":"publisher","first-page":"14085","DOI":"10.1007\/s11227-024-05996-z","volume":"80","author":"P Han","year":"2024","unstructured":"Han, P., Hua, H., Wang, H., Xue, F., Wu, C., Shang, J.: A universal parallel simulation framework for energy pipeline networks on high-performance computers. J. Supercomput. 80(10), 14085\u201314115 (2024)","journal-title":"J. Supercomput."},{"key":"254_CR13","doi-asserted-by":"crossref","unstructured":"Hong, C., Sukumaran-Rajam, A., Nisa, I., Singh, K., Sadayappan, P.: Adaptive sparse tiling for sparse matrix multiplication. In: Proceedings of the 24th Symposium on Principles and Practice of Parallel Programming, pp. 300\u2013314 (2019)","DOI":"10.1145\/3293883.3295712"},{"key":"254_CR14","doi-asserted-by":"crossref","unstructured":"Hu, Y., Ye, Z., Wang, M., Yu, J., Zheng, D., Li, M., Zhang, Z., Zhang, Z., Wang, Y.: Featgraph: A flexible and efficient backend for graph neural network systems. In: SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201313. IEEE. (2020)","DOI":"10.1109\/SC41405.2020.00075"},{"key":"254_CR15","doi-asserted-by":"crossref","unstructured":"Huang, G., Dai, G., Wang, Y., Yang, H.: Ge-spmm: General-purpose sparse matrix-matrix multiplication on gpus for graph neural networks. In: SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201312. IEEE. (2020)","DOI":"10.1109\/SC41405.2020.00076"},{"key":"254_CR16","doi-asserted-by":"crossref","unstructured":"Jiang, P., Hong, C., Agrawal, G.: A novel data transformation and execution strategy for accelerating sparse matrix multiplication on gpus. In: Proceedings of the 25th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, pp. 376\u2013388 (2020)","DOI":"10.1145\/3332466.3374546"},{"key":"254_CR17","doi-asserted-by":"crossref","unstructured":"Liu, Z., Hao, M., Zhang, W., Lu, G., Tian, X., Yang, S., Xie, M., Dai, J., Yuan, C., Wang, D., et al.: Optimizing depthwise separable convolution on dcu. CCF Transactions on High Performance Computing, 1\u201319 (2024)","DOI":"10.1007\/s42514-024-00200-3"},{"key":"254_CR18","unstructured":"Narang, S., Elsen, E., Diamos, G., Sengupta, S.: Exploring sparsity in recurrent neural networks. Preprint at arXiv:1704.05119 (2017)"},{"key":"254_CR19","doi-asserted-by":"crossref","unstructured":"Pang, M., Fei, X., Qu, P., Zhang, Y., Li, Z.: A row decomposition-based approach for sparse matrix multiplication on gpus. In: Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, pp. 377\u2013389 (2024)","DOI":"10.1145\/3627535.3638470"},{"key":"254_CR20","doi-asserted-by":"crossref","unstructured":"Shi, J., Li, S., Xu, Y., Fu, R., Wang, X., Wu, T.: Flashsparse: Minimizing computation redundancy for fast sparse matrix multiplications on tensor cores. In: Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, pp. 312\u2013325 (2025)","DOI":"10.1145\/3710848.3710858"},{"issue":"4","key":"254_CR21","doi-asserted-by":"publisher","first-page":"917","DOI":"10.1137\/0916053","volume":"16","author":"V Simoncini","year":"1995","unstructured":"Simoncini, V., Gallopoulos, E.: An iterative method for nonsymmetric systems with multiple right-hand sides. SIAM J. Sci. Comput. 16(4), 917\u2013933 (1995)","journal-title":"SIAM J. Sci. Comput."},{"key":"254_CR22","doi-asserted-by":"crossref","unstructured":"Tiskin, A.: All-pairs shortest paths computation in the bsp model. In: International Colloquium on Automata, Languages, and Programming, pp. 178\u2013189. Springer. (2001)","DOI":"10.1007\/3-540-48224-5_15"},{"key":"254_CR23","unstructured":"Wang, M.Y.: Deep graph library: Towards efficient and scalable deep learning on graphs. In: ICLR Workshop on Representation Learning on Graphs and Manifolds (2019)"},{"key":"254_CR24","doi-asserted-by":"crossref","unstructured":"Yang, C., Bulu\u00e7, A., Owens, J.D.: Design principles for sparse matrix multiplication on the gpu. In: European Conference on Parallel Processing, pp. 672\u2013687. Springer. (2018)","DOI":"10.1007\/978-3-319-96983-1_48"},{"key":"254_CR25","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Ren, A., Chen, X., Lin, Q., Tan, Y., Liu, D.: Re-compact: Structured pruning and spmm kernel co-design for accelerating dnns on gpus. 2023 IEEE 41st International Conference on Computer Design (ICCD), 399\u2013406 (2023)","DOI":"10.1109\/ICCD58817.2023.00066"}],"container-title":["CCF Transactions on High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-025-00254-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42514-025-00254-x","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-025-00254-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T08:55:59Z","timestamp":1770627359000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42514-025-00254-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,18]]},"references-count":25,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,2]]}},"alternative-id":["254"],"URL":"https:\/\/doi.org\/10.1007\/s42514-025-00254-x","relation":{},"ISSN":["2524-4922","2524-4930"],"issn-type":[{"value":"2524-4922","type":"print"},{"value":"2524-4930","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,18]]},"assertion":[{"value":"7 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 September 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 November 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}