{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T23:42:29Z","timestamp":1770680549728,"version":"3.49.0"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,10,28]],"date-time":"2025-10-28T00:00:00Z","timestamp":1761609600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,28]],"date-time":"2025-10-28T00:00:00Z","timestamp":1761609600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61972408"],"award-info":[{"award-number":["61972408"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. HPC"],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1007\/s42514-025-00248-9","type":"journal-article","created":{"date-parts":[[2025,10,28]],"date-time":"2025-10-28T08:39:47Z","timestamp":1761640787000},"page":"22-36","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Optimizing small matrix multiplications via batch grouping on multi-core DSPs"],"prefix":"10.1007","volume":"8","author":[{"given":"Xiaotian","family":"Chen","sequence":"first","affiliation":[]},{"given":"Pengyu","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3542-4869","authenticated-orcid":false,"given":"Jianbin","family":"Fang","sequence":"additional","affiliation":[]},{"given":"Peng","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Chun","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,10,28]]},"reference":[{"key":"248_CR1","unstructured":"A scientific software for the numerical simulation of seismic wave phenomena and earthquake dynamics. http:\/\/www.seissol.org\/"},{"key":"248_CR2","doi-asserted-by":"publisher","unstructured":"Ali, M., Stotzer, E., Igual, F.D., Geijn, R.A.: Level-3 BLAS on the TI C6678 multi-core DSP. In: Panetta, J., Moreira, J.E., Padua, D.A., Navaux, P.O.A. (eds.) IEEE 24th International Symposium on Computer Architecture and High Performance Computing, SBAC-PAD 2012, New York, NY, USA, October 24\u201326, 2012, pp. 179\u2013186. IEEE Computer Society. https:\/\/doi.org\/10.1109\/SBAC-PAD.2012.26","DOI":"10.1109\/SBAC-PAD.2012.26"},{"key":"248_CR3","doi-asserted-by":"publisher","unstructured":"Ben, M.D., Yang, C., Li, Z., Jornada, F.H., Louie, S.G., Deslippe, J.: Accelerating large-scale excited-state GW calculations on leadership HPC systems. In: Cuicchi, C., Qualters, I., Kramer, W.T. (eds.) Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2020, Virtual Event \/ Atlanta, Georgia, USA, November 9\u201319, p. 4. IEEE\/ACM (2020). https:\/\/doi.org\/10.1109\/SC41405.2020.00008","DOI":"10.1109\/SC41405.2020.00008"},{"key":"248_CR4","doi-asserted-by":"publisher","unstructured":"Bi, D., Li, S., Zhang, Y., Yang, X., Dong, D.: Efficiently running spmv on multi-core DSPs for banded matrix. In: Tari, Z., Li, K., Wu, H. (eds.) Algorithms and Architectures for Parallel Processing\u201423rd International Conference, ICA3PP 2023, Tianjin, China, October 20\u201322, 2023, Proceedings, Part V. Lecture Notes in Computer Science, vol. 14491, pp. 201\u2013220. Springer. https:\/\/doi.org\/10.1007\/978-981-97-0808-6_12","DOI":"10.1007\/978-981-97-0808-6_12"},{"key":"248_CR5","doi-asserted-by":"publisher","unstructured":"Chen, Y., Li, K., Wang, Y., Bai, D., Wang, L., Ma, L., Yuan, L., Zhang, Y., Cao, T., Yang, M.: Convstencil: transform stencil computation to matrix multiplication on tensor cores. In: Steuwer, M., Lee, I.A., Chabbi, M. (eds.) Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, PPoPP 2024, Edinburgh, United Kingdom, March 2\u20136, pp. 333\u2013347. ACM (2024). https:\/\/doi.org\/10.1145\/3627535.3638476","DOI":"10.1145\/3627535.3638476"},{"issue":"3","key":"248_CR6","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1145\/3595178","volume":"49","author":"S Deshmukh","year":"2023","unstructured":"Deshmukh, S., Yokota, R., Bosilca, G.: Cache optimization and performance modeling of batched, small, and rectangular matrix multiplication on Intel, AMD, and Fujitsu processors. ACM Trans. Math. Softw. 49(3), 23\u201312329 (2023). https:\/\/doi.org\/10.1145\/3595178","journal-title":"ACM Trans. Math. Softw."},{"key":"248_CR7","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Burstein, J., Doran, C., Solorio, T. (eds.) Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019, Minneapolis, MN, USA, June 2\u20137, Volume 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics (2019). https:\/\/doi.org\/10.18653\/V1\/N19-1423","DOI":"10.18653\/V1\/N19-1423"},{"key":"248_CR8","doi-asserted-by":"publisher","unstructured":"Dongarra, J.J., Hammarling, S., Higham, N.J., Relton, S.D., Valero-Lara, P., Zounon, M.: The design and performance of batched BLAS on modern high-performance computing systems. In: Koumoutsakos, P., Lees, M., Krzhizhanovskaya, V.V., Dongarra, J.J., Sloot, P.M.A. (eds.) International Conference on Computational Science, ICCS 2017, 12\u201314 June 2017, Zurich, Switzerland. Procedia Computer Science, vol. 108, pp. 495\u2013504. Elsevier. https:\/\/doi.org\/10.1016\/J.PROCS.2017.05.138","DOI":"10.1016\/J.PROCS.2017.05.138"},{"key":"248_CR9","unstructured":"Dukhan, M.: The indirect convolution algorithm (2019). CoRR arXiv:1907.02129"},{"issue":"1","key":"248_CR10","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/s11390-020-0741-6","volume":"36","author":"J Fang","year":"2021","unstructured":"Fang, J., Liao, X., Huang, C., Dong, D.: Performance evaluation of memory-centric ARMV8 many-core architectures: a case study with Phytium 2000+. J. Comput. Sci. Technol. 36(1), 33\u201343 (2021). https:\/\/doi.org\/10.1007\/s11390-020-0741-6","journal-title":"J. Comput. Sci. Technol."},{"issue":"4","key":"248_CR11","doi-asserted-by":"publisher","first-page":"509","DOI":"10.1631\/FITEE.2200359","volume":"24","author":"J Fang","year":"2023","unstructured":"Fang, J., Zhang, P., Huang, C., Tang, T., Lu, K., Wang, R., Wang, Z.: Programming bare-metal accelerators with heterogeneous threading models: a case study of matrix-3000. Front. Inf. Technol. Electron. Eng. 24(4), 509\u2013520 (2023). https:\/\/doi.org\/10.1631\/FITEE.2200359","journal-title":"Front. Inf. Technol. Electron. Eng."},{"issue":"2","key":"248_CR12","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1145\/3378671","volume":"46","author":"G Frison","year":"2020","unstructured":"Frison, G., Sartor, T., Zanelli, A., Diehl, M.: The BLAS API of BLASFEO: optimizing performance for small matrices. ACM Trans. Math. Softw. 46(2), 15\u201311536 (2020). https:\/\/doi.org\/10.1145\/3378671","journal-title":"ACM Trans. Math. Softw."},{"key":"248_CR13","doi-asserted-by":"publisher","unstructured":"Fu, X., Yang, W., Dong, D., Su, X.: Optimizing attention by exploiting data reuse on ARM multi-core CPUs. In: Kise, K., Salapura, V., Annavaram, M., Varbanescu, A.L. (eds.) Proceedings of the 38th ACM International Conference on Supercomputing, ICS 2024, Kyoto, Japan, June 4\u20137, pp. 137\u2013149. ACM (2024). https:\/\/doi.org\/10.1145\/3650200.3656620","DOI":"10.1145\/3650200.3656620"},{"key":"248_CR14","doi-asserted-by":"publisher","unstructured":"Georganas, E., Banerjee, K., Kalamkar, D.D., Avancha, S., Venkat, A., Anderson, M.J., Henry, G., Pabst, H., Heinecke, A.: Harnessing deep learning via a single building block. In: 2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS), New Orleans, LA, USA, May 18\u201322, pp. 222\u2013233. IEEE (2020). https:\/\/doi.org\/10.1109\/IPDPS47924.2020.00032","DOI":"10.1109\/IPDPS47924.2020.00032"},{"issue":"3","key":"248_CR15","doi-asserted-by":"publisher","first-page":"12","DOI":"10.1145\/1356052.1356053","volume":"34","author":"K Goto","year":"2008","unstructured":"Goto, K., Geijn, R.A.: Anatomy of high-performance matrix multiplication. ACM Trans. Math. Softw. 34(3), 12\u201311225 (2008). https:\/\/doi.org\/10.1145\/1356052.1356053","journal-title":"ACM Trans. Math. Softw."},{"issue":"1","key":"248_CR16","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1145\/1377603.1377607","volume":"35","author":"K Goto","year":"2008","unstructured":"Goto, K., Geijn, R.A.: High-performance implementation of the level-3 BLAS. ACM Trans. Math. Softw. 35(1), 4\u20131414 (2008). https:\/\/doi.org\/10.1145\/1377603.1377607","journal-title":"ACM Trans. Math. Softw."},{"key":"248_CR17","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, June 27\u201330, pp. 770\u2013778. IEEE Computer Society (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"248_CR18","doi-asserted-by":"publisher","unstructured":"Heinecke, A., Henry, G., Hutchinson, M., Pabst, H.: LIBXSMM: accelerating small matrix multiplications by runtime code generation. In: West, J., Pancake, C.M. (eds.) Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2016, Salt Lake City, UT, USA, November 13\u201318, pp. 981\u2013991. IEEE Computer Society (2016). https:\/\/doi.org\/10.1109\/SC.2016.83","DOI":"10.1109\/SC.2016.83"},{"key":"248_CR19","doi-asserted-by":"publisher","unstructured":"Igual, F.D., Ali, M., Friedmann, A., Stotzer, E., Wentz, T., Geijn, R.A.: Unleashing the high-performance and low-power of multi-core DSPs for general-purpose HPC. In: Hollingsworth, J.K. (ed.) SC Conference on High Performance Computing Networking, Storage and Analysis, SC \u201912, Salt Lake City, UT, USA\u2014November 11\u201315, 2012, p. 26. IEEE\/ACM. https:\/\/doi.org\/10.1109\/SC.2012.109","DOI":"10.1109\/SC.2012.109"},{"key":"248_CR20","doi-asserted-by":"publisher","unstructured":"Jia, W., Wang, H., Chen, M., Lu, D., Lin, L., Car, R., E, W., Zhang, L.: Pushing the limit of molecular dynamics with ab initio accuracy to 100 million atoms with machine learning. In: Cuicchi, C., Qualters, I., Kramer, W.T. (eds.) Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2020, Virtual Event\/Atlanta, Georgia, USA, November 9\u201319, p. 5. IEEE\/ACM (2020). https:\/\/doi.org\/10.1109\/SC41405.2020.00009","DOI":"10.1109\/SC41405.2020.00009"},{"key":"248_CR21","doi-asserted-by":"publisher","unstructured":"Jiang, J., Du, J., Huang, D., Li, D., Zheng, J., Lu, Y.: Characterizing and optimizing transformer inference on ARM many-core processor. In: Proceedings of the 51st International Conference on Parallel Processing, ICPP 2022, Bordeaux, France, 29 August 2022\u20131 September 2022, pp. 20\u201312011. ACM (2022). https:\/\/doi.org\/10.1145\/3545008.3545022","DOI":"10.1145\/3545008.3545022"},{"key":"248_CR22","doi-asserted-by":"publisher","unstructured":"Kim, K., Costa, T.B., Deveci, M., Bradley, A.M., Hammond, S.D., Guney, M.E., Knepper, S., Story, S., Rajamanickam, S.: Designing vector-friendly compact BLAS and LAPACK kernels. In: Mohr, B., Raghavan, P. (eds.) Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2017, Denver, CO, USA, November 12\u201317, 2017, p. 55. ACM. https:\/\/doi.org\/10.1145\/3126908.3126941","DOI":"10.1145\/3126908.3126941"},{"issue":"3","key":"248_CR23","doi-asserted-by":"publisher","first-page":"580","DOI":"10.1109\/TPDS.2019.2939785","volume":"31","author":"H Lan","year":"2019","unstructured":"Lan, H., Meng, J., Hundt, C., Schmidt, B., Deng, M., Wang, X., Liu, W., Qiao, Y., Feng, S.: FeatherCNN: fast inference computation with TensorGEMM on ARM architectures. IEEE Trans. Parallel Distrib. Syst. 31(3), 580\u2013594 (2019). https:\/\/doi.org\/10.1109\/TPDS.2019.2939785","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"248_CR24","doi-asserted-by":"publisher","unstructured":"Li, X., Liang, Y., Yan, S., Jia, L., Li, Y.: A coordinated tiling and batching framework for efficient GEMM on gpus. In: Hollingsworth, J.K., Keidar, I. (eds.) Proceedings of the 24th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP 2019, Washington, DC, USA, February 16\u201320, pp. 229\u2013241. ACM (2019). https:\/\/doi.org\/10.1145\/3293883.3295734","DOI":"10.1145\/3293883.3295734"},{"issue":"10","key":"248_CR25","doi-asserted-by":"publisher","first-page":"2148","DOI":"10.1109\/TPDS.2019.2906891","volume":"30","author":"S Ma","year":"2019","unstructured":"Ma, S., Liu, Z., Chen, S., Huang, L., Guo, Y., Wang, Z., Zhang, M.: Coordinated DMA: improving the DRAM access efficiency for matrix multiplication. IEEE Trans. Parallel Distrib. Syst. 30(10), 2148\u20132164 (2019). https:\/\/doi.org\/10.1109\/TPDS.2019.2906891","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"248_CR26","unstructured":"Nek5000\/NekBox. https:\/\/github.com\/NekBox\/NekBox"},{"key":"248_CR27","doi-asserted-by":"publisher","unstructured":"Penuchot, J., Falcou, J., Khabou, A.: Modern generative programming for optimizing small matrix-vector multiplication. In: 2018 International Conference on High Performance Computing & Simulation, HPCS 2018, Orleans, France, July 16\u201320, 2018, pp. 508\u2013514. IEEE. https:\/\/doi.org\/10.1109\/HPCS.2018.00086","DOI":"10.1109\/HPCS.2018.00086"},{"key":"248_CR28","doi-asserted-by":"publisher","unstructured":"Smith, J.E.: Decoupled access\/execute computer architectures. In: Szygenda, S.A., Hughes, J., Blanton, M., Wagner, T.J., Frailey, D.J., Gunter, T., McLeavy, C., Lipovski, G.J., Malek, M. (eds.) 9th International Symposium on Computer Architecture (ISCA 1982), Austin, TX, USA, April 26\u201329, 1982, pp. 112\u2013119. IEEE Computer Society. https:\/\/doi.org\/10.5555\/800048.801719","DOI":"10.5555\/800048.801719"},{"key":"248_CR29","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., Polosukhin, I.: Attention is all you need. In: Guyon, I., Luxburg, U., Bengio, S., Wallach, H.M., Fergus, R., Vishwanathan, S.V.N., Garnett, R. (eds.) Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4\u20139, Long Beach, CA, USA, pp. 5998\u20136008 (2017). https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"248_CR30","doi-asserted-by":"publisher","DOI":"10.1177\/1094342019882246","author":"X Wang","year":"2020","unstructured":"Wang, X., Zhou, Z., Hu, C., Yang, W., Zhao, M., Wang, Z., Shi, P.: Accelerating and tuning small matrix multiplications on Sunway TaihuLight: a case study of spectral element CFD code NEK5000. Int. J. High Perform. Comput. Appl. (2020). https:\/\/doi.org\/10.1177\/1094342019882246","journal-title":"Int. J. High Perform. Comput. Appl."},{"issue":"1","key":"248_CR31","doi-asserted-by":"publisher","first-page":"114","DOI":"10.1007\/S42514-020-00057-2","volume":"3","author":"Y Wang","year":"2021","unstructured":"Wang, Y., Li, C., Liu, C., Liu, S., Lei, Y., Zhang, J., Zhang, Y., Guo, Y.: Advancing DSP into HPC, AI, and beyond: challenges, mechanisms, and future directions. CCF Trans. High Perform. Comput. 3(1), 114\u2013125 (2021). https:\/\/doi.org\/10.1007\/S42514-020-00057-2","journal-title":"CCF Trans. High Perform. Comput."},{"key":"248_CR32","doi-asserted-by":"publisher","unstructured":"Yang, W., Fang, J., Dong, D., Su, X., Wang, Z.: LIBSHALOM: optimizing small and irregular-shaped matrix multiplications on ARMV8 multi-cores. In: Supinski, B.R., Hall, M.W., Gamblin, T. (eds.) International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2021, St. Louis, Missouri, USA, November 14\u201319, 2021, p. 72. ACM. https:\/\/doi.org\/10.1145\/3458817.3476217","DOI":"10.1145\/3458817.3476217"},{"key":"248_CR33","doi-asserted-by":"publisher","first-page":"36413","DOI":"10.1109\/ACCESS.2019.2905302","volume":"7","author":"C Yang","year":"2019","unstructured":"Yang, C., Chen, S., Zhang, J., Lv, Z., Wang, Z.: A novel DSP architecture for scientific computing and deep learning. IEEE Access 7, 36413\u201336425 (2019). https:\/\/doi.org\/10.1109\/ACCESS.2019.2905302","journal-title":"IEEE Access"},{"issue":"5","key":"248_CR34","doi-asserted-by":"publisher","first-page":"503","DOI":"10.1007\/S42514-023-00175-7","volume":"6","author":"M Yang","year":"2024","unstructured":"Yang, M., Zhang, P., Fang, J., Liu, W., Huang, C.: thSORT: an efficient parallel sorting algorithm on multi-core DSPs. CCF Trans. High Perform. Comput. 6(5), 503\u2013518 (2024). https:\/\/doi.org\/10.1007\/S42514-023-00175-7","journal-title":"CCF Trans. High Perform. Comput."},{"key":"248_CR35","doi-asserted-by":"publisher","unstructured":"Yin, S., Wang, Q., Hao, R., Zhou, T., Mei, S., Liu, J.: Optimizing irregular-shaped matrix-matrix multiplication on multi-core DSPs. In: IEEE International Conference on Cluster Computing, CLUSTER 2022, Heidelberg, Germany, September 5\u20138, 2022, pp. 451\u2013461. IEEE. https:\/\/doi.org\/10.1109\/CLUSTER51413.2022.00055","DOI":"10.1109\/CLUSTER51413.2022.00055"},{"key":"248_CR36","doi-asserted-by":"publisher","unstructured":"Yu, K., Qi, X., Zhang, P., Fang, J., Dong, D., Wang, R., Tang, T., Huang, C., Che, Y., Wang, Z.: Optimizing general matrix multiplications on modern multi-core DSPs. In: IEEE International Parallel and Distributed Processing Symposium, IPDPS 2024, San Francisco, CA, USA, May 27\u201331, 2024, pp. 964\u2013975. IEEE. https:\/\/doi.org\/10.1109\/IPDPS57955.2024.00090","DOI":"10.1109\/IPDPS57955.2024.00090"},{"issue":"3","key":"248_CR37","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1145\/2764454","volume":"41","author":"FGV Zee","year":"2015","unstructured":"Zee, F.G.V., Geijn, R.A.: BLIS: a framework for rapidly instantiating BLAS functionality. ACM Trans. Math. Softw. 41(3), 14\u201311433 (2015). https:\/\/doi.org\/10.1145\/2764454","journal-title":"ACM Trans. Math. Softw."},{"key":"248_CR38","doi-asserted-by":"publisher","unstructured":"Zhang, Y., Wang, Y., Mo, Z., Zhou, Y., Sun, T., Xu, G., Xing, C., Yang, L.: Accelerating small matrix multiplications by adaptive batching strategy on GPU. In: 24th IEEE Int Conf on High Performance Computing & Communications; 8th Int Conf on Data Science & Systems; 20th Int Conf on Smart City; 8th Int Conf on Dependability in Sensor, Cloud & Big Data Systems & Application, HPCC\/DSS\/SmartCity\/DependSys 2022, Hainan, China, December 18\u201320, 2022, pp. 882\u2013887. IEEE. https:\/\/doi.org\/10.1109\/HPCC-DSS-SMARTCITY-DEPENDSYS57074.2022.00143","DOI":"10.1109\/HPCC-DSS-SMARTCITY-DEPENDSYS57074.2022.00143"},{"key":"248_CR39","doi-asserted-by":"publisher","unstructured":"Zhang, X., Wang, Q., Zhang, Y.: Model-driven level 3 BLAS performance optimization on Loongson 3A processor. In: 18th IEEE International Conference on Parallel and Distributed Systems, ICPADS 2012, Singapore, December 17\u201319, 2012, pp. 684\u2013691. IEEE Computer Society. https:\/\/doi.org\/10.1109\/ICPADS.2012.97","DOI":"10.1109\/ICPADS.2012.97"},{"key":"248_CR40","doi-asserted-by":"publisher","DOI":"10.1016\/J.SYSARC.2025.103341","volume":"160","author":"Y Zhang","year":"2025","unstructured":"Zhang, Y., Lu, L., Yang, Z., Liang, Z., Suo, S.: A load-balanced acceleration method for small and irregular batch matrix multiplication on GPU. J. Syst. Archit. 160, 103341 (2025). https:\/\/doi.org\/10.1016\/J.SYSARC.2025.103341","journal-title":"J. Syst. Archit."},{"key":"248_CR41","doi-asserted-by":"publisher","unstructured":"Zhu, F., Qi, X., Zhang, P., Fang, J., Tang, T., Che, Y., Yu, K., Xie, J., Huang, C., Ren, J.: Optimizing stencil computation on multi-core DSPs. In: Proceedings of the 53rd International Conference on Parallel Processing, ICPP 2024, Gotland, Sweden, August 12\u201315, 2024, pp. 679\u2013690. ACM. https:\/\/doi.org\/10.1145\/3673038.3673062","DOI":"10.1145\/3673038.3673062"}],"container-title":["CCF Transactions on High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-025-00248-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42514-025-00248-9","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-025-00248-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T08:56:02Z","timestamp":1770627362000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42514-025-00248-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,28]]},"references-count":41,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,2]]}},"alternative-id":["248"],"URL":"https:\/\/doi.org\/10.1007\/s42514-025-00248-9","relation":{},"ISSN":["2524-4922","2524-4930"],"issn-type":[{"value":"2524-4922","type":"print"},{"value":"2524-4930","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,28]]},"assertion":[{"value":"15 June 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 August 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 October 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}