{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T02:39:40Z","timestamp":1770691180306,"version":"3.49.0"},"reference-count":28,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T00:00:00Z","timestamp":1764288000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T00:00:00Z","timestamp":1764288000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2023ZD0120604"],"award-info":[{"award-number":["2023ZD0120604"]}]},{"name":"National Key Research and Development Program of China","award":["2024YFB4504103"],"award-info":[{"award-number":["2024YFB4504103"]}]},{"name":"Major Science and Technology Special projects in Henan Province","award":["241111212300"],"award-info":[{"award-number":["241111212300"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. HPC"],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1007\/s42514-025-00261-y","type":"journal-article","created":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T15:24:30Z","timestamp":1764343470000},"page":"107-119","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Optimizing winograd-based convolution with DCU\u2019s matrix cores"],"prefix":"10.1007","volume":"8","author":[{"given":"Jiandong","family":"Shang","sequence":"first","affiliation":[]},{"given":"Fuchang","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Zhaopeng","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yizhe","family":"Sui","sequence":"additional","affiliation":[]},{"given":"Gang","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Nan","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Lingling","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2733-3064","authenticated-orcid":false,"given":"Dujuan","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,28]]},"reference":[{"key":"261_CR1","unstructured":"AMD GFX9 instructions documentation. https:\/\/llvm.org\/docs\/AMDGPU\/AMDGPUAsmGFX9.html (2024)"},{"key":"261_CR2","unstructured":"AMD Hip documentation. https:\/\/rocm.docs.amd.com\/projects\/HIP\/en\/latest\/index.html (2023)"},{"key":"261_CR3","unstructured":"AMD Matrix cores. https:\/\/rocm.blogs.amd.com\/software-tools-optimization\/matrix-cores\/README.html (2024)"},{"key":"261_CR4","unstructured":"AMD Rocm documentation. https:\/\/rocm.docs.amd.com\/en\/latest\/index.html (2024)"},{"issue":"17","key":"261_CR5","doi-asserted-by":"publisher","first-page":"2033","DOI":"10.3390\/math9172033","volume":"9","author":"RL Castro","year":"2021","unstructured":"Castro, R.L., Andrade, D., Fraguela, B.B.: Opencnn: a winograd minimal filtering algorithm implementation in cuda. Mathematics 9(17), 2033 (2021)","journal-title":"Mathematics"},{"key":"261_CR6","unstructured":"Chetlur, S., Woolley, C., Vandermersch, P., Cohen, J., Tran, J., Catanzaro, B., Shelhamer, E.: cudnn: efficient primitives for deep learning. arXiv preprint arXiv:1410.0759 (2014)"},{"key":"261_CR7","first-page":"229","volume":"38","author":"E Fathi","year":"2018","unstructured":"Fathi, E., Shoja, B..M.: Deep neural networks for natural language processing. In: Handbook of Statistics 38, 229\u2013316. Elsevier (2018)","journal-title":"In: Handbook of Statistics"},{"key":"261_CR8","doi-asserted-by":"crossref","unstructured":"Gao, H., Gao, Y., Ji, Q.: Implementation and optimization of the winograd convolution algorithm for domestic gpu-like accelerators: winograd: convolution algorithm weight gradient optimization to accelerate convolution computation in cnn. In: Proceedings of the 3rd international conference on computer, artificial intelligence and control engineering, pp. 322\u2013328 (2024)","DOI":"10.1145\/3672758.3672811"},{"key":"261_CR9","unstructured":"Gui, H., Zhang, X., Zhang, C., Su, Z., Li, H.: Optimizing winograd convolution on armv8 manycore processors. arXiv preprint arXiv:2411.16152 (2024)"},{"issue":"17","key":"261_CR10","doi-asserted-by":"publisher","first-page":"19547","DOI":"10.1007\/s11227-023-05399-6","volume":"79","author":"Y Guo","year":"2023","unstructured":"Guo, Y., Lu, L., Zhu, S.: Novel accelerated methods for convolution neural network with matrix core. J. Supercomput. 79(17), 19547\u201319573 (2023)","journal-title":"J. Supercomput."},{"issue":"7","key":"261_CR11","first-page":"986","volume":"69","author":"L Jia","year":"2020","unstructured":"Jia, L., Liang, Y., Li, X., Lu, L., Yan, S.: Enabling efficient fast convolution algorithms on gpus via megakernels. IEEE Trans. Comput. 69(7), 986\u2013997 (2020)","journal-title":"IEEE Trans. Comput."},{"key":"261_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2022.102954","volume":"113","author":"J Jiang","year":"2022","unstructured":"Jiang, J., Huang, D., Du, J., Lu, Y., Liao, X.: Optimizing small channel 3d convolution on gpu with tensor core. Parallel Comput. 113, 102954 (2022)","journal-title":"Parallel Comput."},{"issue":"2","key":"261_CR13","doi-asserted-by":"publisher","first-page":"1459","DOI":"10.1007\/s10586-021-03494-y","volume":"25","author":"M Jord\u00e0","year":"2022","unstructured":"Jord\u00e0, M., Valero-Lara, P., Pe\u00f1a, A.J.: Cuconv: cuda implementation of convolution for cnn inference. Clust. Comput. 25(2), 1459\u20131473 (2022)","journal-title":"Clust. Comput."},{"key":"261_CR14","unstructured":"Khan, J., Fultz, P., Tamazov, A., Lowell, D., Liu, C., Melesse, M., Nandhimandalam, M., Nasyrov, K., Perminov, I., Shah, T., et al.: Miopen: an open source library for deep learning primitives. arXiv preprint arXiv:1910.00078 (2019)"},{"key":"261_CR15","doi-asserted-by":"crossref","unstructured":"Lavin, A., Gray, S.: Fast algorithms for convolutional neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 4013\u20134021 (2016)","DOI":"10.1109\/CVPR.2016.435"},{"key":"261_CR16","doi-asserted-by":"crossref","unstructured":"Liu, Z., Hao, M., Zhang, W., Lu, G., Tian, X., Yang, S., Xie, M., Dai, J., Yuan, C., Wang, D., et al.: Optimizing depthwise separable convolution on dcu. CCF transactions on high performance computing, pp. 1\u201319 (2024)","DOI":"10.1007\/s42514-024-00200-3"},{"key":"261_CR17","doi-asserted-by":"crossref","unstructured":"Liu, J., Yang, D., Lai, J.: Optimizing winograd-based convolution with tensor cores. In: proceedings of the 50th international conference on parallel processing, pp. 1\u201310 (2021)","DOI":"10.1145\/3472456.3472473"},{"issue":"1","key":"261_CR18","doi-asserted-by":"publisher","first-page":"70","DOI":"10.1109\/TPDS.2021.3084813","volume":"33","author":"G Lu","year":"2021","unstructured":"Lu, G., Zhang, W., Wang, Z.: Optimizing depthwise separable convolution operations on gpus. IEEE Trans. Parallel Distrib. Syst. 33(1), 70\u201387 (2021)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"261_CR19","unstructured":"Mathieu, M., Henaff, M., LeCun, Y.: Fast training of convolutional networks through ffts. arXiv preprint arXiv:1312.5851 (2013)"},{"key":"261_CR20","unstructured":"Micikevicius, P., Narang, S., Alben, J., Diamos, G., Elsen, E., Garcia, D., Ginsburg, B., Houston, M., Kuchaiev, O., Venkatesh, G., et al.: Mixed precision training. arXiv preprint arXiv:1710.03740 (2017)"},{"issue":"6s","key":"261_CR21","first-page":"2161","volume":"29","author":"R Patel","year":"2020","unstructured":"Patel, R., Patel, S.: A comprehensive study of applying convolutional neural network for computer vision. Int. J. Adv. Sci. Technol. 29(6s), 2161\u20132174 (2020)","journal-title":"International Journal of Advanced Science and Technology"},{"issue":"9","key":"261_CR22","doi-asserted-by":"publisher","first-page":"2352","DOI":"10.1162\/neco_a_00990","volume":"29","author":"W Rawat","year":"2017","unstructured":"Rawat, W., Wang, Z.: Deep convolutional neural networks for image classification: a comprehensive review. Neural Comput. 29(9), 2352\u20132449 (2017)","journal-title":"Neural Comput."},{"key":"261_CR23","doi-asserted-by":"crossref","unstructured":"Song, Z., Wang, J., Li, T., Jiang, L., Ke, J., Liang, X., Jing, N.: Gpnpu: Enabling efficient hardware-based direct convolution with multi-precision support in gpu tensor cores. In: 2020 57th ACM\/IEEE design automation conference (DAC), IEEE, pp. 1\u20136 (2020).","DOI":"10.1109\/DAC18072.2020.9218566"},{"key":"261_CR24","unstructured":"Thread block swizzle. https:\/\/github.com\/NVIDIA\/cutlass\/blob\/main\/include\/cutlass\/gemm\/threadblock\/threadblock_swizzle.h (2024)"},{"key":"261_CR25","unstructured":"Vardhana, M., Pinto, R.: High-performance winograd based accelerator architecture for convolutional neural network. IEEE Computer Architecture Letters (2025)"},{"key":"261_CR26","doi-asserted-by":"crossref","unstructured":"Wang, Q., Mei, S., Liu, J., Gong, C.: Parallel convolution algorithm using implicit matrix multiplication on multi-core cpus. In: 2019 international joint conference on neural networks (ijcnn), IEEE, pp. 1\u20137 (2019)","DOI":"10.1109\/IJCNN.2019.8852012"},{"key":"261_CR27","doi-asserted-by":"crossref","unstructured":"Wei, H., Liu, E., Zhao, Y., Yu, H.: Efficient non-fused winograd on gpus. In: advances in computer graphics: 37th computer graphics international conference, CGI 2020, Proceedings 37, Geneva, Switzerland, Springer, 20\u201323 October 2020, pp. 411\u2013418 (2020)","DOI":"10.1007\/978-3-030-61864-3_35"},{"key":"261_CR28","doi-asserted-by":"crossref","unstructured":"Yan, D., Wang, W., Chu, X.: Optimizing batched winograd convolution on gpus. In: Proceedings of the 25th ACM SIGPLAN symposium on principles and practice of parallel programming, pp. 32\u201344 (2020)","DOI":"10.1145\/3332466.3374520"}],"container-title":["CCF Transactions on High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-025-00261-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42514-025-00261-y","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-025-00261-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T08:55:52Z","timestamp":1770627352000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42514-025-00261-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,28]]},"references-count":28,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,2]]}},"alternative-id":["261"],"URL":"https:\/\/doi.org\/10.1007\/s42514-025-00261-y","relation":{},"ISSN":["2524-4922","2524-4930"],"issn-type":[{"value":"2524-4922","type":"print"},{"value":"2524-4930","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,28]]},"assertion":[{"value":"9 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 September 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 November 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}