{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T23:42:21Z","timestamp":1740181341526,"version":"3.37.3"},"reference-count":36,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2023,5,17]],"date-time":"2023-05-17T00:00:00Z","timestamp":1684281600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,5,17]],"date-time":"2023-05-17T00:00:00Z","timestamp":1684281600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022ZD0117805"],"award-info":[{"award-number":["2022ZD0117805"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072018","U22A2028"],"award-info":[{"award-number":["62072018","U22A2028"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Iluvatar CoreX semiconductor Co., Ltd"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. HPC"],"published-print":{"date-parts":[[2023,9]]},"DOI":"10.1007\/s42514-023-00147-x","type":"journal-article","created":{"date-parts":[[2023,5,17]],"date-time":"2023-05-17T07:01:59Z","timestamp":1684306919000},"page":"322-333","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Adapting combined tiling to stencil optimizations on sunway processor"],"prefix":"10.1007","volume":"5","author":[{"given":"Biao","family":"Sun","sequence":"first","affiliation":[]},{"given":"Mingzhen","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1101-7927","authenticated-orcid":false,"given":"Hailong","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Jun","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Zhongzhi","family":"Luan","sequence":"additional","affiliation":[]},{"given":"Depei","family":"Qian","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,5,17]]},"reference":[{"key":"147_CR1","doi-asserted-by":"crossref","unstructured":"Ahmad, Z., Chowdhury, R., Das, R., Ganapathi, P., Gregory, A., Zhu, Y.: Fast stencil computations using fast fourier transforms. In: Proceedings of the 33rd ACM Symposium on Parallelism in Algorithms and Architectures, pp. 8\u201321 (2021)","DOI":"10.1145\/3409964.3461803"},{"key":"147_CR2","doi-asserted-by":"publisher","unstructured":"Ao, Y., Yang, C., Wang, X., Xue, W., Fu, H., Liu, F., Gan, L., Xu, P., Ma, W.: 26 pflops stencil computations for atmospheric modeling on sunway taihulight. In: 2017 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 535\u2013544 (2017). https:\/\/doi.org\/10.1109\/IPDPS.2017.9","DOI":"10.1109\/IPDPS.2017.9"},{"key":"147_CR3","doi-asserted-by":"crossref","unstructured":"Bertolacci, I.J., Olschanowsky, C., Harshbarger, B., Chamberlain, B.L., Wonnacott, D.G., Strout, M.M.: Parameterized diamond tiling for stencil computations with chapel parallel iterators. In: Proceedings of the 29th ACM on International Conference on Supercomputing, pp. 197\u2013206 (2015)","DOI":"10.1145\/2751205.2751226"},{"key":"147_CR4","doi-asserted-by":"publisher","unstructured":"Cai, Y., Yang, C., Ma, W., Ao, Y.: Extreme-scale realistic stencil computations on sunway taihulight with ten million cores. In: 2018 18th IEEE\/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGRID), pp. 566\u2013571 (2018). https:\/\/doi.org\/10.1109\/CCGRID.2018.00086","DOI":"10.1109\/CCGRID.2018.00086"},{"issue":"4","key":"147_CR5","doi-asserted-by":"publisher","first-page":"923","DOI":"10.1109\/TPDS.2018.2871189","volume":"30","author":"Y Chen","year":"2019","unstructured":"Chen, Y., Li, K., Yang, W., Xiao, G., Xie, X., Li, T.: Performance-aware model for sparse matrix-matrix multiplication on the sunway taihulight supercomputer. IEEE Trans. Parallel Distrib. Syst. 30(4), 923\u2013938 (2019). https:\/\/doi.org\/10.1109\/TPDS.2018.2871189","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"issue":"10","key":"147_CR6","doi-asserted-by":"publisher","first-page":"2329","DOI":"10.1109\/TPDS.2020.2990429","volume":"31","author":"Y Chen","year":"2020","unstructured":"Chen, Y., Xiao, G., \u00d6zsu, M.T., Liu, C., Zomaya, A.Y., Li, T.: AESPTV: an adaptive and efficient framework for sparse tensor-vector product kernel on a high-performance computing platform. IEEE Trans. Parallel Distrib. Syst. 31(10), 2329\u20132345 (2020). https:\/\/doi.org\/10.1109\/TPDS.2020.2990429","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"147_CR7","doi-asserted-by":"crossref","unstructured":"Dongarra, J., Peterson, G., Tomov, S., Allred, J., Natoli, V., Richie, D.: Exploring new architectures in accelerating cfd for air force applications. In: 2008 DoD HPCMP Users Group Conference, pp. 472\u2013478. IEEE (2008)","DOI":"10.1109\/DoD.HPCMP.UGC.2008.12"},{"key":"147_CR8","doi-asserted-by":"crossref","unstructured":"Frigo, M., Strumpen, V.: Cache oblivious stencil computations. In: Proceedings of the 19th Annual International Conference on Supercomputing, pp. 361\u2013366 (2005)","DOI":"10.1145\/1088149.1088197"},{"key":"147_CR9","doi-asserted-by":"crossref","unstructured":"Fu, H., He, C., Chen, B., Yin, Z., Zhang, Z., Zhang, W., Zhang, T., Xue, W., Liu, W., Yin, W., et al.: 9-pflops nonlinear earthquake simulation on sunway taihulight: enabling depiction of 18-hz and 8-meter scenarios. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201312 (2017)","DOI":"10.1145\/3126908.3126910"},{"key":"147_CR10","doi-asserted-by":"crossref","unstructured":"Garvey, J.D., Abdelrahman, T.S.: Automatic performance tuning of stencil computations on gpus. In: 2015 44th International Conference on Parallel Processing, pp. 300\u2013309. IEEE (2015)","DOI":"10.1109\/ICPP.2015.39"},{"issue":"1","key":"147_CR11","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1002\/cpe.1340","volume":"21","author":"J Guo","year":"2009","unstructured":"Guo, J., Bikshandi, G., Fraguela, B.B., Padua, D.: Writing productive stencil codes with overlapped tiling. Concurr. Comput. Pract. Exp. 21(1), 25\u201339 (2009)","journal-title":"Concurr. Comput. Pract. Exp."},{"key":"147_CR12","unstructured":"Habich, J., Zeiser, T., Hager, G., Wellein, G.: Enabling temporal blocking for a lattice Boltzmann flow solver through multicore-aware wavefront parallelization. In: 21st International Conference on Parallel Computational Fluid Dynamics, pp. 178\u2013182 (2009)"},{"key":"147_CR13","doi-asserted-by":"publisher","unstructured":"Jiang, L., Yang, C., Ao, Y., Yin, W., Ma, W., Sun, Q., Liu, F., Lin, R., Zhang, P.: Towards highly efficient dgemm on the emerging sw26010 many-core processor. In: 2017 46th International Conference on Parallel Processing (ICPP), pp. 422\u2013431 (2017). https:\/\/doi.org\/10.1109\/ICPP.2017.51","DOI":"10.1109\/ICPP.2017.51"},{"key":"147_CR14","doi-asserted-by":"publisher","unstructured":"Li, L., Fang, J., Fu, H., Jiang, J., Zhao, W., He, C., You, X., Yang, G.: swcaffe: A parallel framework for accelerating deep learning applications on sunway taihulight. In: 2018 IEEE International Conference on Cluster Computing (CLUSTER), pp. 413\u2013422 (2018). https:\/\/doi.org\/10.1109\/CLUSTER.2018.00087","DOI":"10.1109\/CLUSTER.2018.00087"},{"key":"147_CR15","doi-asserted-by":"crossref","unstructured":"Li, M., Liu, Y., Yang, H., Hu, Y., Sun, Q., Chen, B., You, X., Liu, X., Luan, Z., Qian, D.: Automatic code generation and optimization of large-scale stencil computation on many-core processors. In: 50th International Conference on Parallel Processing, pp. 1\u201312 (2021)","DOI":"10.1145\/3472456.3473517"},{"key":"147_CR17","doi-asserted-by":"crossref","unstructured":"Li, K., Yuan, L., Zhang, Y., Yue, Y.: Reducing redundancy in data organization and arithmetic calculation for stencil computations. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201315 (2021)","DOI":"10.1145\/3458817.3476154"},{"key":"147_CR16","doi-asserted-by":"crossref","unstructured":"Li, K., Yuan, L., Zhang, Y., Yue, Y., Cao, H.: An efficient vectorization scheme for stencil computation. In: 2022 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 650\u2013660. IEEE (2022)","DOI":"10.1109\/IPDPS53621.2022.00069"},{"key":"147_CR18","doi-asserted-by":"publisher","unstructured":"Liu, C., Xie, B., Liu, X., Xue, W., Yang, H., Liu, X.: Towards efficient spmv on sunway manycore architectures. In: Proceedings of the 2018 International Conference on Supercomputing. ICS \u201918, pp. 363\u2013373. Association for Computing Machinery, New York, NY, USA (2018). https:\/\/doi.org\/10.1145\/3205289.3205313","DOI":"10.1145\/3205289.3205313"},{"key":"147_CR19","doi-asserted-by":"publisher","first-page":"386","DOI":"10.1007\/978-3-030-60245-1_27","volume-title":"Algorithms and Architectures for Parallel Processing","author":"Y Liu","year":"2020","unstructured":"Liu, Y., Liu, L., Hu, M., Wang, W., Xue, W., Zhu, Q.: Performance modeling of stencil computation on sw26010 processors. In: Qiu, M. (ed.) Algorithms and Architectures for Parallel Processing, pp. 386\u2013400. Springer, Cham (2020)"},{"key":"147_CR20","doi-asserted-by":"crossref","unstructured":"Matsumura, K., Zohouri, H.R., Wahib, M., Endo, T., Matsuoka, S.: An5d: automated stencil framework for high-degree temporal blocking on gpus. In: Proceedings of the 18th ACM\/IEEE International Symposium on Code Generation and Optimization, pp. 199\u2013211 (2020)","DOI":"10.1145\/3368826.3377904"},{"key":"147_CR21","doi-asserted-by":"crossref","unstructured":"Micikevicius, P.: 3d finite difference computation on gpus using cuda. In: Proceedings of 2nd Workshop on General Purpose Processing on Graphics Processing Units, pp. 79\u201384 (2009)","DOI":"10.1145\/1513895.1513905"},{"key":"147_CR22","doi-asserted-by":"crossref","unstructured":"Mostafazadeh, B., Marti, F., Liu, F., Chandramowlishwaran, A.: Roofline guided design and analysis of a multi-stencil cfd solver for multicore performance. In: 2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 753\u2013762. IEEE (2018)","DOI":"10.1109\/IPDPS.2018.00085"},{"key":"147_CR23","doi-asserted-by":"publisher","unstructured":"Nguyen, A., Satish, N., Chhugani, J., Kim, C., Dubey, P.: 3.5-d blocking optimization for stencil computations on modern cpus and gpus. In: SC \u201910: Proceedings of the 2010 ACM\/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201313 (2010). https:\/\/doi.org\/10.1109\/SC.2010.2","DOI":"10.1109\/SC.2010.2"},{"issue":"8","key":"147_CR24","doi-asserted-by":"publisher","first-page":"1717","DOI":"10.1175\/BAMS-D-15-00308.1","volume":"98","author":"JG Powers","year":"2017","unstructured":"Powers, J.G., Klemp, J.B., Skamarock, W.C., Davis, C.A., Dudhia, J., Gill, D.O., Coen, J.L., Gochis, D.J., Ahmadov, R., Peckham, S.E., et al.: The weather research and forecasting model: overview, system efforts, and future directions. Bull. Am. Meteor. Soc. 98(8), 1717\u20131737 (2017)","journal-title":"Bull. Am. Meteor. Soc."},{"issue":"11","key":"147_CR26","doi-asserted-by":"publisher","first-page":"1902","DOI":"10.1109\/JPROC.2018.2862896","volume":"106","author":"PS Rawat","year":"2018","unstructured":"Rawat, P.S., Vaidya, M., Sukumaran-Rajam, A., Ravishankar, M., Grover, V., Rountev, A., Pouchet, L.-N., Sadayappan, P.: Domain-specific optimization and generation of high-performance gpu code for stencil computations. Proc. IEEE 106(11), 1902\u20131920 (2018)","journal-title":"Proc. IEEE"},{"key":"147_CR25","doi-asserted-by":"crossref","unstructured":"Rawat, P.S., Vaidya, M., Sukumaran-Rajam, A., Rountev, A., Pouchet, L.-N., Sadayappan, P.: On optimizing complex stencils on gpus. In: 2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 641\u2013652. IEEE (2019)","DOI":"10.1109\/IPDPS.2019.00073"},{"key":"147_CR27","doi-asserted-by":"crossref","unstructured":"Rivera, G., Tseng, C.-W.: Tiling optimizations for 3d scientific computations. In: SC\u201900: Proceedings of the 2000 ACM\/IEEE Conference on Supercomputing, p. 32. IEEE (2000)","DOI":"10.1109\/SC.2000.10015"},{"key":"147_CR28","doi-asserted-by":"crossref","unstructured":"Sun, Q., Liu, Y., Yang, H., Jiang, Z., Liu, X., Dun, M., Luan, Z., Qian, D.: cstuner: Scalable auto-tuning framework for complex stencil computation on gpus. In: 2021 IEEE International Conference on Cluster Computing (CLUSTER) (2021)","DOI":"10.1109\/Cluster48925.2021.00037"},{"key":"147_CR29","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1007\/978-981-15-8083-3_13","volume-title":"Artificial Intelligence and Security","author":"Y Tang","year":"2020","unstructured":"Tang, Y., Li, M., Chen, Z., Xue, C., Zhao, C., Yang, H.: Parallel optimization of stencil computation base on sunway taihulight. In: Sun, X., Wang, J., Bertino, E. (eds.) Artificial Intelligence and Security, pp. 141\u2013152. Springer, Singapore (2020)"},{"key":"147_CR30","doi-asserted-by":"crossref","unstructured":"Wellein, G., Hager, G., Zeiser, T., Wittmann, M., Fehske, H.: Efficient temporal blocking for stencil computations by multicore-aware wavefront parallelization. In: 2009 33rd Annual IEEE International Computer Software and Applications Conference, vol. 1, pp. 579\u2013586. IEEE (2009)","DOI":"10.1109\/COMPSAC.2009.82"},{"key":"147_CR31","doi-asserted-by":"publisher","unstructured":"Xu, Z., Lin, J., Matsuoka, S.: Benchmarking sw26010 many-core processor. In: 2017 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW), pp. 743\u2013752 (2017). https:\/\/doi.org\/10.1109\/IPDPSW.2017.9","DOI":"10.1109\/IPDPSW.2017.9"},{"key":"147_CR32","doi-asserted-by":"crossref","unstructured":"Xu, S., Xu, Y., Xue, W., Shen, X., Zheng, F., Huang, X., Yang, G.: Taming the\u201c monster\u201d: Overcoming program optimization challenges on sw26010 through precise performance modeling. In: 2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 763\u2013773. IEEE (2018)","DOI":"10.1109\/IPDPS.2018.00086"},{"key":"147_CR33","doi-asserted-by":"crossref","unstructured":"Yang, C., Xue, W., Fu, H., You, H., Wang, X., Ao, Y., Liu, F., Gan, L., Xu, P., Wang, L., et al.: 10m-core scalable fully-implicit solver for nonhydrostatic atmospheric dynamics. In: SC\u201916: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 57\u201368. IEEE (2016)","DOI":"10.1109\/SC.2016.5"},{"key":"147_CR34","doi-asserted-by":"crossref","unstructured":"Yount, C.R., Tobin, J., Breuer, A., Duran, A.: Yask-yet another stencil kernel: A framework for hpc stencil code-generation and tuning. 2016 Sixth International Workshop on Domain-Specific Languages and High-Level Frameworks for High Performance Computing (WOLFHPC), pp. 30\u201339 (2016)","DOI":"10.1109\/WOLFHPC.2016.08"},{"key":"147_CR36","doi-asserted-by":"crossref","unstructured":"Yuan, L., Zhang, Y., Guo, P., Huang, S.: Tessellating stencils. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201313 (2017)","DOI":"10.1145\/3126908.3126920"},{"key":"147_CR35","doi-asserted-by":"publisher","unstructured":"Yuan, L., Cao, H., Zhang, Y., Li, K., Lu, P., Yue, Y.: Temporal vectorization for stencils. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. SC \u201921. Association for Computing Machinery, New York, NY, USA (2021). https:\/\/doi.org\/10.1145\/3458817.3476149","DOI":"10.1145\/3458817.3476149"}],"container-title":["CCF Transactions on High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-023-00147-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42514-023-00147-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-023-00147-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,4]],"date-time":"2023-12-04T08:06:53Z","timestamp":1701677213000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42514-023-00147-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,17]]},"references-count":36,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2023,9]]}},"alternative-id":["147"],"URL":"https:\/\/doi.org\/10.1007\/s42514-023-00147-x","relation":{},"ISSN":["2524-4922","2524-4930"],"issn-type":[{"type":"print","value":"2524-4922"},{"type":"electronic","value":"2524-4930"}],"subject":[],"published":{"date-parts":[[2023,5,17]]},"assertion":[{"value":"5 March 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 April 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 May 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declared that they have no conflicts of interest to this work.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}