{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T08:04:27Z","timestamp":1770710667898,"version":"3.49.0"},"reference-count":34,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,2,22]],"date-time":"2025-02-22T00:00:00Z","timestamp":1740182400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,22]],"date-time":"2025-02-22T00:00:00Z","timestamp":1740182400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Major Science and Technology Special Project in Henan Provience","award":["221100210600"],"award-info":[{"award-number":["221100210600"]}]},{"name":"Scientific Research Team Plan of the Zhengzhou University of Aeronautics","award":["23ZHTD01003"],"award-info":[{"award-number":["23ZHTD01003"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. HPC"],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1007\/s42514-024-00205-y","type":"journal-article","created":{"date-parts":[[2025,2,22]],"date-time":"2025-02-22T17:53:16Z","timestamp":1740246796000},"page":"142-154","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Optimizing 2D convolution for DCUs"],"prefix":"10.1007","volume":"7","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-0378-6115","authenticated-orcid":false,"given":"Wenlong","family":"Fan","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8015-6392","authenticated-orcid":false,"given":"Haobo","family":"Hua","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7673-2641","authenticated-orcid":false,"given":"Jiandong","family":"Shang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6991-6013","authenticated-orcid":false,"given":"Zhuxin","family":"Wen","sequence":"additional","affiliation":[]},{"given":"Hengliang","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Litao","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,22]]},"reference":[{"key":"205_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40537-021-00444-8","volume":"8","author":"L Alzubaidi","year":"2021","unstructured":"Alzubaidi, L., Zhang, J., Humaidi, A.J., Al-Dujaili, A., Duan, Y., Al-Shamma, O., Santamar\u00eda, J., Fadhel, M.A., Al-Amidie, M., Farhan, L.: Review of deep learning: concepts, cnn architectures, challenges, applications, future directions. J. Big Data 8, 1\u201374 (2021)","journal-title":"J. Big Data"},{"issue":"2","key":"205_CR2","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1109\/MCI.2014.2307227","volume":"9","author":"E Cambria","year":"2014","unstructured":"Cambria, E., White, B.: Jumping NLP curves: a review of natural language processing research. IEEE Comput. Intell. Mag. 9(2), 48\u201357 (2014)","journal-title":"IEEE Comput. Intell. Mag."},{"key":"205_CR3","doi-asserted-by":"crossref","unstructured":"Cao, K., Wu, Q., Wang, L., Guo, H., Wang, N., Cheng, H., Tang, X., Liu, L., Li, D., Wu, H., : GPU-HADVPPM4HIP V1. 0: higher model accuracy on china\u2019s domestically GPU-like accelerator using heterogeneous compute interface for portability (HIP) technology to accelerate the piecewise parabolic method (PPM) in an air quality model (CAMx v6. 10). Geoscientific Model Development Discussions 2024, 1\u201322 (2024)","DOI":"10.5194\/gmd-2023-222"},{"key":"205_CR4","unstructured":"Chetlur, S., Woolley, C., Vandermersch, P., Cohen, J., Tran, J., Catanzaro, B., Shelhamer, E.: cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759 (2014)"},{"key":"205_CR5","unstructured":"Detailed explanation of Split-k technology for cutlass in Github. https:\/\/github.com\/NVIDIA\/cutlass\/blob\/main\/examples\/06_splitK_gemm\/splitk_gemm.cu (2024). Accessed 18 June 2024"},{"issue":"1","key":"205_CR6","first-page":"1","volume":"19","author":"Y Fu","year":"2021","unstructured":"Fu, Y., Bolotin, E., Chatterjee, N., Nellans, D., Keckler, S.W.: GPU domain specialization via composable on-package architecture. ACM Trans. Arch. Code Optim. (TACO) 19(1), 1\u201323 (2021)","journal-title":"ACM Trans. Arch. Code Optim. (TACO)"},{"issue":"3","key":"205_CR7","first-page":"16","volume":"10","author":"SK Gaikwad","year":"2010","unstructured":"Gaikwad, S.K., Gawali, B.W., Yannawar, P.: A review on speech recognition technique. Int. J. Comput. Appl. 10(3), 16\u201324 (2010)","journal-title":"Int. J. Comput. Appl."},{"key":"205_CR8","doi-asserted-by":"crossref","unstructured":"Gajurel, A., Louis, S.J., Wu, R., Barford, L., Harris\u00a0Jr, F.C.: GPU acceleration of sparse neural networks. In: ITNG 2021 18th International Conference on Information Technology-New Generations, pp. 323\u2013330 (2021). Springer","DOI":"10.1007\/978-3-030-70416-2_41"},{"key":"205_CR10","doi-asserted-by":"publisher","DOI":"10.1016\/j.energy.2023.128179","volume":"282","author":"P Han","year":"2023","unstructured":"Han, P., Hua, H., Wang, H., Shang, J.: A graphic partition method based on nodes learning for energy pipelines network simulation. Energy 282, 128179 (2023)","journal-title":"Energy"},{"key":"205_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s11227-024-05996-z","volume":"80","author":"P Han","year":"2024","unstructured":"Han, P., Hua, H., Wang, H., Xue, F., Wu, C., Shang, J.: A universal parallel simulation framework for energy pipeline networks on high-performance computers. J. Supercomput. 80, 1\u201331 (2024). https:\/\/doi.org\/10.1007\/s11227-024-05996-z","journal-title":"J. Supercomput."},{"key":"205_CR11","doi-asserted-by":"publisher","first-page":"167","DOI":"10.1016\/bs.adcom.2020.11.003","volume":"122","author":"W Jeon","year":"2021","unstructured":"Jeon, W., Ko, G., Lee, J., Lee, H., Ro, W.W.: Deep learning with GPUs. Adv. Comput. 122, 167\u2013215 (2021). https:\/\/doi.org\/10.1016\/bs.adcom.2020.11.003","journal-title":"Adv. Comput."},{"key":"205_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s42514-023-00178-4","volume":"6","author":"J Jia","year":"2024","unstructured":"Jia, J., Lin, X., Lin, F., Liu, Y.: Dcu-chk: checkpointing for large-scale cpu-dcu heterogeneous computing systems. CCF Trans. High Perform. Comput. 6, 1\u201314 (2024). https:\/\/doi.org\/10.1007\/s42514-023-00178-4","journal-title":"CCF Trans. High Perform. Comput."},{"key":"205_CR13","doi-asserted-by":"crossref","unstructured":"Jogin, M., Madhulika, M., Divya, G., Meghana, R., Apoorva, S.: Feature extraction using convolution neural networks (CNN) and deep learning. In: 2018 3rd IEEE International Conference on Recent Trends in Electronics, Information & Communication Technology (RTEICT), pp. 2319\u20132323 (2018). IEEE","DOI":"10.1109\/RTEICT42901.2018.9012507"},{"key":"205_CR14","doi-asserted-by":"publisher","first-page":"70461","DOI":"10.1109\/ACCESS.2019.2918851","volume":"7","author":"M Jorda","year":"2019","unstructured":"Jorda, M., Valero-Lara, P., Pena, A.J.: Performance evaluation of cudnn convolution algorithms on nvidia volta GPUs. IEEE Access 7, 70461\u201370473 (2019)","journal-title":"IEEE Access"},{"issue":"2","key":"205_CR15","doi-asserted-by":"publisher","first-page":"1459","DOI":"10.1007\/s10586-021-03494-y","volume":"25","author":"M Jord\u00e0","year":"2022","unstructured":"Jord\u00e0, M., Valero-Lara, P., Pe\u00f1a, A.J.: cuconv: Cuda implementation of convolution for cnn inference. Clust. Comput. 25(2), 1459\u20131473 (2022)","journal-title":"Clust. Comput."},{"key":"205_CR16","unstructured":"Kenton, J.D.M.-W.C., Toutanova, L.K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of naacL-HLT, vol. 1, p. 2 (2019)"},{"key":"205_CR17","unstructured":"Khan, J., Fultz, P., Tamazov, A., Lowell, D., Liu, C., Melesse, M., Nandhimandalam, M., Nasyrov, K., Perminov, I., Shah, T., et al.: Miopen: An open source library for deep learning primitives. arXiv preprint arXiv:1910.00078 (2019)"},{"key":"205_CR18","doi-asserted-by":"publisher","first-page":"5455","DOI":"10.1007\/s10462-020-09825-6","volume":"53","author":"A Khan","year":"2020","unstructured":"Khan, A., Sohail, A., Zahoora, U., Qureshi, A.S.: A survey of the recent architectures of deep convolutional neural networks. Artif. Intell. Rev. 53, 5455\u20135516 (2020)","journal-title":"Artif. Intell. Rev."},{"key":"205_CR19","doi-asserted-by":"crossref","unstructured":"Lavin, A., Gray, S.: Fast algorithms for convolutional neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4013\u20134021 (2016)","DOI":"10.1109\/CVPR.2016.435"},{"key":"205_CR23","doi-asserted-by":"crossref","unstructured":"Li, S., Zhang, Y., Xiang, C., Shi, L.: Fast convolution operations on many-core architectures. In: 2015 IEEE 17th International Conference on High Performance Computing and Communications, 2015 IEEE 7th International Symposium on Cyberspace Safety and Security, and 2015 IEEE 12th International Conference on Embedded Software and Systems, pp. 316\u2013323 (2015). IEEE","DOI":"10.1109\/HPCC-CSS-ICESS.2015.94"},{"key":"205_CR21","doi-asserted-by":"crossref","unstructured":"Li, C., Yang, Y., Feng, M., Chakradhar, S., Zhou, H.: Optimizing memory efficiency for deep convolutional neural networks on GPUs. In: SC\u201916: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 633\u2013644 (2016a). IEEE","DOI":"10.1109\/SC.2016.53"},{"key":"205_CR22","doi-asserted-by":"crossref","unstructured":"Li, X., Zhang, G., Huang, H.H., Wang, Z., Zheng, W.: Performance analysis of GPU-based convolutional neural networks. In: 2016 45th International Conference on Parallel Processing (ICPP), pp. 67\u201376 (2016b). IEEE","DOI":"10.1109\/ICPP.2016.15"},{"key":"205_CR20","doi-asserted-by":"crossref","unstructured":"Li, X., Liang, Y., Yan, S., Jia, L., Li, Y.: A coordinated tiling and batching framework for efficient gemm on GPUs. In: Proceedings of the 24th Symposium on Principles and Practice of Parallel Programming, pp. 229\u2013241 (2019)","DOI":"10.1145\/3293883.3295734"},{"issue":"1","key":"205_CR24","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1186\/s13636-023-00290-x","volume":"2023","author":"T Liu","year":"2023","unstructured":"Liu, T., Yuan, X.: Paralinguistic and spectral feature extraction for speech emotion classification using machine learning techniques. EURASIP J. Audio Speech Music Process. 2023(1), 23 (2023)","journal-title":"EURASIP J. Audio Speech Music Process."},{"issue":"1","key":"205_CR25","doi-asserted-by":"publisher","first-page":"70","DOI":"10.1109\/TPDS.2021.3084813","volume":"33","author":"G Lu","year":"2021","unstructured":"Lu, G., Zhang, W., Wang, Z.: Optimizing depthwise separable convolution operations on GPUs. IEEE Trans. Parallel Distrib. Syst. 33(1), 70\u201387 (2021)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"205_CR26","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysarc.2019.101635","volume":"99","author":"S Mittal","year":"2019","unstructured":"Mittal, S., Vaishay, S.: A survey of techniques for optimizing deep learning on GPUs. J. Syst. Arch. 99, 101635 (2019)","journal-title":"J. Syst. Arch."},{"issue":"5","key":"205_CR27","doi-asserted-by":"publisher","first-page":"5979","DOI":"10.1007\/s11063-022-11122-y","volume":"55","author":"E Parsaeimehr","year":"2023","unstructured":"Parsaeimehr, E., Fartash, M., Akbari Torkestani, J.: Improving feature extraction using a hybrid of CNN and LSTM for entity identification. Neural Process. Lett. 55(5), 5979\u20135994 (2023)","journal-title":"Neural Process. Lett."},{"issue":"12","key":"205_CR28","doi-asserted-by":"publisher","first-page":"2295","DOI":"10.1109\/JPROC.2017.2761740","volume":"105","author":"V Sze","year":"2017","unstructured":"Sze, V., Chen, Y.-H., Yang, T.-J., Emer, J.S.: Efficient processing of deep neural networks: a tutorial and survey. Proc. IEEE 105(12), 2295\u20132329 (2017)","journal-title":"Proc. IEEE"},{"key":"205_CR29","unstructured":"Vasilache, N., Johnson, J., Mathieu, M., Chintala, S., Piantino, S., LeCun, Y.: Fast convolutional nets with fbfft: A GPU performance evaluation. arXiv preprint arXiv:1412.7580 (2014)"},{"key":"205_CR30","doi-asserted-by":"crossref","unstructured":"Wang, Q., Mei, S., Liu, J., Gong, C.: Parallel convolution algorithm using implicit matrix multiplication on multi-core cpus. In: 2019 International Joint Conference on Neural Networks (ijcnn), pp. 1\u20137 (2019). IEEE","DOI":"10.1109\/IJCNN.2019.8852012"},{"key":"205_CR31","doi-asserted-by":"crossref","unstructured":"Yan, D., Wang, W., Chu, X.: Optimizing batched winograd convolution on GPUs. In: Proceedings of the 25th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, pp. 32\u201344 (2020)","DOI":"10.1145\/3332466.3374520"},{"issue":"1","key":"205_CR32","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1007\/s11760-022-02212-4","volume":"17","author":"T Zhang","year":"2023","unstructured":"Zhang, T., Li, S., Feng, G., Liang, J., He, L., Zhao, X.: Local channel transformation for efficient convolutional neural network. SIViP 17(1), 129\u2013137 (2023)","journal-title":"SIViP"},{"key":"205_CR33","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Yang, M., Guo, C., Leng, J., Liang, Y., Chen, Q., Guo, M., Zhu, Y.: Characterizing and demystifying the implicit convolution algorithm on commercial matrix-multiplication accelerators. In: 2021 IEEE International Symposium on Workload Characterization (IISWC), pp. 214\u2013225 (2021). IEEE","DOI":"10.1109\/IISWC53511.2021.00029"},{"issue":"5","key":"205_CR34","doi-asserted-by":"publisher","first-page":"831","DOI":"10.1109\/TVLSI.2018.2791442","volume":"26","author":"M Zhu","year":"2018","unstructured":"Zhu, M., Zhuo, Y., Wang, C., Chen, W., Xie, Y.: Performance evaluation and optimization of HBM-enabled GPU for data-intensive applications. IEEE Trans. Very Large Scale Integr. (VLSI) Syst. 26(5), 831\u2013840 (2018)","journal-title":"IEEE Trans. Very Large Scale Integr. (VLSI) Syst."}],"container-title":["CCF Transactions on High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-024-00205-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42514-024-00205-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-024-00205-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,18]],"date-time":"2025-04-18T09:54:55Z","timestamp":1744970095000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42514-024-00205-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,22]]},"references-count":34,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,4]]}},"alternative-id":["205"],"URL":"https:\/\/doi.org\/10.1007\/s42514-024-00205-y","relation":{},"ISSN":["2524-4922","2524-4930"],"issn-type":[{"value":"2524-4922","type":"print"},{"value":"2524-4930","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,22]]},"assertion":[{"value":"9 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 October 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}