{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,19]],"date-time":"2025-03-19T15:47:50Z","timestamp":1742399270537,"version":"3.37.3"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2019,5,13]],"date-time":"2019-05-13T00:00:00Z","timestamp":1557705600000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,5,13]],"date-time":"2019-05-13T00:00:00Z","timestamp":1557705600000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"National Key R&D Plan of China","award":["2016YFB0200603"],"award-info":[{"award-number":["2016YFB0200603"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["JQ18001"],"award-info":[{"award-number":["JQ18001"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Cluster Comput"],"published-print":{"date-parts":[[2020,6]]},"DOI":"10.1007\/s10586-019-02938-w","type":"journal-article","created":{"date-parts":[[2019,5,14]],"date-time":"2019-05-14T01:50:48Z","timestamp":1557798648000},"page":"493-507","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Solving a trillion unknowns per second with HPGMG on Sunway TaihuLight"],"prefix":"10.1007","volume":"23","author":[{"given":"Wenjing","family":"Ma","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yulong","family":"Ao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7426-6248","authenticated-orcid":false,"given":"Chao","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Samuel","family":"Williams","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,5,13]]},"reference":[{"key":"2938_CR1","doi-asserted-by":"crossref","unstructured":"Adams, M.F., Brown, J., Shalf, J., Straalen, B.V., Strohmaier, E., Williams, S.: HPGMG 1.0:A Benchmark for Ranking High Performance Computing Systems. Lawrence Berkeley National Lab, Berkeley (2014)","DOI":"10.2172\/1131029"},{"issue":"11","key":"2938_CR2","doi-asserted-by":"publisher","first-page":"5690","DOI":"10.1007\/s11227-016-1871-z","volume":"74","author":"M Aldinucci","year":"2018","unstructured":"Aldinucci, M., Danelutto, M., Drocco, M., Kilpatrick, P., Misale, C., Peretti Pezzi, G., Torquati, M.: A parallel pattern for iterative stencil + reduce. J. Supercomput. 74(11), 5690\u20135705 (2018). \nhttps:\/\/doi.org\/10.1007\/s11227-016-1871-z","journal-title":"J. Supercomput."},{"key":"2938_CR3","doi-asserted-by":"crossref","unstructured":"Ao, Y., Liu, Y., Yang, C., Liu, F., Zhang, P., Lu, Y., Du, Y.: Performance Evaluation of HPGMG on Tianhe-2: arly Experience, pp. 230\u2013243. Springer, Cham (2015)","DOI":"10.1007\/978-3-319-27140-8_17"},{"key":"2938_CR4","unstructured":"Ao, Y., Yang, C., Wang, X., Xue, W., Fu, H., Liu, F., Gan, L., Xu, P., Ma, W.: 26 PFLOPS stencil computations for atmospheric modeling on sunway TaihuLight. In: 2017 IEEE International Parallel and Distributed Processing Symposium, IPDPS 2017, Orlando, May 29\u2013June 2, 2017, pp. 535\u2013544 (2017)"},{"key":"2938_CR5","unstructured":"Basu, P., Hall, M., Williams, S., Straalen, B.V., Oliker, L., Colella, P.: In: 2015 IEEE International Parallel and Distributed Processing Symposium"},{"key":"2938_CR6","doi-asserted-by":"crossref","unstructured":"Basu, P., Hall, M., Williams, S., Van Straalen, B., Oliker, L.: Converting Stencils to Accumulations for Communication-Avoiding Optimization in Geometric Multigrid, pp. 9\u201316. Association for Computing Machinery, Inc (2014)","DOI":"10.1145\/2686745.2686749"},{"key":"2938_CR7","doi-asserted-by":"crossref","unstructured":"Basu, P., Venkat, A., Hall, M., Williams, S., Van Straalen, B., Oliker, L.: Compiler Generation and Autotuning of Communication-Avoiding Operators for Geometric Multigrid. IEEE Computer Society (2013)","DOI":"10.1109\/HiPC.2013.6799131"},{"issue":"C","key":"2938_CR8","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1016\/j.parco.2017.04.002","volume":"64","author":"P Basu","year":"2017","unstructured":"Basu, P., Williams, S., Van Straalen, B., Oliker, L., Colella, P., Hall, M.: Compiler-based code generation and autotuning for geometric multigrid on GPU-accelerated supercomputers. Parallel Comput. 64(C), 50\u201364 (2017)","journal-title":"Parallel Comput."},{"issue":"2","key":"2938_CR9","doi-asserted-by":"publisher","first-page":"255","DOI":"10.1007\/s10586-013-0332-1","volume":"17","author":"W Cao","year":"2014","unstructured":"Cao, W., Xu, C.F., Wang, Z.H., Yao, L., Liu, H.Y.: Cpu\/gpu computing for a multi-block structured grid based high-order flow solver on a large heterogeneous system. Clust. Comput. 17(2), 255\u2013270 (2014). \nhttps:\/\/doi.org\/10.1007\/s10586-013-0332-1","journal-title":"Clust. Comput."},{"key":"2938_CR10","unstructured":"Datta, K., Murphy, M., Volkov, V., Williams, S., Carter, J., Oliker, L., Patterson, D., Shalf, J., Yelick, K.: Stencil computation optimization and auto-tuning on state-of-the-art multicore architectures. In: Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing, SC \u201908, pp. 4:1\u20134:12. IEEE Press, Piscataway (2008). \nhttp:\/\/dl.acm.org\/citation.cfm?id=1413370.1413375"},{"key":"2938_CR11","doi-asserted-by":"publisher","DOI":"10.1201\/b10376-18","volume-title":"Auto-tuning Stencil Computations on Multicore and Accelerators","author":"K Datta","year":"2010","unstructured":"Datta, K., Williams, S., Volkov, V., Carter, J., Oliker, L., Shalf, J., Yelick, K.: Auto-tuning Stencil Computations on Multicore and Accelerators. CRC Press, Boca Raton (2010)"},{"key":"2938_CR12","doi-asserted-by":"publisher","unstructured":"Dong, W., Kang, L., Quan, Z., Li, K., Li, K., Hao, Z., Xie, X.H.: Implementing molecular dynamics simulation on sunway TaihuLight system. In: 2016 IEEE 18th International Conference on High Performance Computing and Communications; IEEE 14th International Conference on Smart City; IEEE 2nd International Conference on Data Science and Systems (HPCC\/SmartCity\/DSS), pp. 443\u2013450 (2016). \nhttps:\/\/doi.org\/10.1109\/HPCC-SmartCity-DSS.2016.0070","DOI":"10.1109\/HPCC-SmartCity-DSS.2016.0070"},{"key":"2938_CR13","unstructured":"Dongarra, J.: Confessions of an accidental benchmarker. \nhttp:\/\/sc13.supercomputing.org\/sites\/default\/files\/WorkshopsArchive\/pdfs\/wp156s1.pdf"},{"key":"2938_CR14","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1177\/1094342015593158","volume":"30","author":"J Dongarra","year":"2015","unstructured":"Dongarra, J., Heroux, M.A., Luszczek, P.: High-performance conjugate-gradient benchmark: a new metric for ranking high-performance computing systems. Int. J. High Perform. Comput. Appl. 30, 3\u201310 (2015). \nhttps:\/\/doi.org\/10.1177\/1094342015593158","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"2938_CR15","doi-asserted-by":"publisher","first-page":"803","DOI":"10.1002\/cpe.728","volume":"15","author":"JJ Dongarra","year":"2003","unstructured":"Dongarra, J.J., Luszczek, P., Petitet, A.: The LINPACK benchmark: past, present and future. Concurr. Comput. 15, 803\u2013820 (2003). \nhttps:\/\/doi.org\/10.1002\/cpe.728","journal-title":"Concurr. Comput."},{"key":"2938_CR16","doi-asserted-by":"crossref","unstructured":"Fu, H., He, C., Chen, B., Yin, Z., Zhang, Z., Zhang, W., Zhang, T., Xue, W., Liu, W., Yin, W., Yang, G., Chen, X.: 18.9Pflopss nonlinear earthquake simulation on sunway TaihuLight: enabling depiction of 18-Hz and 8-meter scenarios. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC \u201917, pp. 2:1\u20132:12. ACM, New York (2017)","DOI":"10.1145\/3126908.3126910"},{"key":"2938_CR17","doi-asserted-by":"crossref","unstructured":"Fu, H., Liao, J., Ding, N., Duan, X., Gan, L., Liang, Y., Wang, X., Yang, J., Zheng, Y., Liu, W., Wang, L., Yang, G.: Redesigning CAM-SE for peta-scale climate modeling performance and ultra-high resolution on sunway TaihuLight. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC \u201917, pp. 1:1\u20131:12. ACM, New York (2017)","DOI":"10.1145\/3126908.3126909"},{"key":"2938_CR18","doi-asserted-by":"publisher","first-page":"072001","DOI":"10.1007\/s11432-016-5588-7","volume":"59","author":"H Fu","year":"2016","unstructured":"Fu, H., Liao, J., Yang, J., Wang, L., Song, Z., Huang, X., Yang, C., Xue, W., Liu, F., Qiao, F., Zhao, W., Yin, X., Hou, C., Zhang, C., Ge, W., Zhang, J., Wang, Y., Zhou, C., Yang, G.: The sunway TaihuLight supercomputer: system and applications. Sci. China Inf. Sci. 59, 072001 (2016). \nhttps:\/\/doi.org\/10.1007\/s11432-016-5588-7","journal-title":"Sci. China Inf. Sci."},{"key":"2938_CR19","doi-asserted-by":"crossref","unstructured":"Hagedorn, B., Stoltzfus, L., Steuwer, M., Gorlatch, S., Dubach, C.: High performance stencil code generation with lift. In: CGO. ACM, pp. 100\u2013112 (2018)","DOI":"10.1145\/3168824"},{"key":"2938_CR20","doi-asserted-by":"crossref","unstructured":"Holewinski, J., Pouchet, L.N., Sadayappan, P.: High-performance code generation for stencil computations on GPU architectures. In: Proceedings of the 26th ACM International Conference on Supercomputing, ICS \u201912, pp. 311\u2013320. ACM, New York (2012)","DOI":"10.1145\/2304576.2304619"},{"key":"2938_CR21","unstructured":"https:\/\/graph500.org\n\n (2017)"},{"key":"2938_CR22","doi-asserted-by":"crossref","unstructured":"Jiang, L., Yang, C., Ao, Y., Ma, W.: Towards highly efficient DGEMM on the emerging SW26010 many-core processor. In: The 46th International Conference on Parallel Processing\u2019 (2017)","DOI":"10.1109\/ICPP.2017.51"},{"key":"2938_CR23","doi-asserted-by":"crossref","unstructured":"K\u00f6stler, H., Feichtinger, C., R\u00fcde, U., Aoki, T.: A Geometric Multigrid Solver on Tsubame 2.0, pp. 155\u2013173. Springer Berlin Heidelberg, Berlin, Heidelberg (2014)","DOI":"10.1007\/978-3-642-54774-4_8"},{"key":"2938_CR24","doi-asserted-by":"crossref","unstructured":"K\u00f6stler, H., Ritter, D., Feichtinger, C.: A Geometric Multigrid Solver on GPU Clusters, pp. 407\u2013422. Springer Berlin Heidelberg, Berlin, Heidelberg (2013)","DOI":"10.1007\/978-3-642-16405-7_26"},{"key":"2938_CR25","doi-asserted-by":"crossref","unstructured":"Kwack, J., Bauer, G.H.: HPCG and HPGMG Benchmark Tests on Multiple Program, Multiple Data (MPMD) Mode on Blue Waters\u2014A Cray XE6\/XK7 Hybrid System. \nhttps:\/\/cug.org\/proceedings\/cug2017_proceedings\/includes\/files\/pap118s2-file1.pdf\n\n (2017)","DOI":"10.1002\/cpe.4298"},{"issue":"6","key":"2938_CR26","doi-asserted-by":"publisher","first-page":"1262","DOI":"10.1007\/s11390-016-1696-5","volume":"31","author":"W Ma","year":"2016","unstructured":"Ma, W., Gao, K., Long, G.: Highly optimized code generation for stencil codes with computation reuse for GPUs. J. Comput. Sci. Technol. 31(6), 1262\u20131274 (2016)","journal-title":"J. Comput. Sci. Technol."},{"key":"2938_CR27","unstructured":"Maruyama, N., Aoki, T.: Optimizing Stencil Computations for nvidia kepler gpus (2014)"},{"key":"2938_CR28","unstructured":"Meuer, H., Strohmaier, E., Dongarra, J., Simon, H., Martin, M.: Top 500 Supercomputer Lists (2016). \nhttp:\/\/www.top500.org"},{"key":"2938_CR29","doi-asserted-by":"crossref","unstructured":"Nguyen, A., Satish, N., Chhugani, J., Kim, C., Dubey, P.: 3.5-d blocking optimization for stencil computations on modern cpus and gpus. In: 2010 ACM\/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201313 (2010)","DOI":"10.1109\/SC.2010.2"},{"key":"2938_CR30","unstructured":"Qiao, F., Zhao, W., Yin, X., Huang, X., Liu, X., Shu, Q., Wang, G., Song, Z., Li, X., Liu, H., Yang, G., Yuan, Y.: A highly effective global surface wave numerical simulation with ultra-high resolution. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC \u201916, pp. 5:1\u20135:11. IEEE Press, Piscataway (2016). \nhttp:\/\/dl.acm.org\/citation.cfm?id=3014904.3014911"},{"key":"2938_CR31","unstructured":"Sakharnykh, N.: \nhttps:\/\/github.com\/e-ago\/hpgmg-cuda-async\n\n (2016)"},{"key":"2938_CR32","unstructured":"Sakharnykh, N.: Beyond GPU Memory Limits with Unified Memory on Pascal. \nhttps:\/\/devblogs.nvidia.com\/parallelforall\/beyond-gpu-memory-limits-unified-memory-pascal\/\n\n (2016)"},{"key":"2938_CR33","doi-asserted-by":"crossref","unstructured":"Stock, K., Kong, M., Grosser, T., Pouchet, L.N., Rastello, F., Ramanujam, J., Sadayappan, P.: A framework for enhancing data reuse via associative reordering. In: Proceedings of the 35th ACM SIGPLAN Conference on Programming Language Design and Implementation, PLDI \u201914, pp. 65\u201376. ACM, New York (2014)","DOI":"10.1145\/2594291.2594342"},{"key":"2938_CR34","doi-asserted-by":"crossref","unstructured":"Tan, G., Li, L., Triechle, S., Phillips, E., Bao, Y., Sun, N.: Fast implementation of DGEMM on fermi GPU. In: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, p.\u00a035. ACM (2011)","DOI":"10.1145\/2063384.2063431"},{"key":"2938_CR35","unstructured":"Williams, S.: Hpgmg. \nhttps:\/\/crd.lbl.gov\/assets\/pubs_presos\/HPGMG-FV-FF2-Proxy-App.pdf"},{"key":"2938_CR36","unstructured":"Williams, S., Kalamkar, D.D., Singh, A., Deshpande, A.M., Straalen, B.V., Smelyanskiy, M., Almgren, A., Dubey, P., Shalf, J., Oliker, L.: Optimization of geometric multigrid for emerging multi- and manycore processors. In: High Performance Computing, Networking, Storage and Analysis (SC), 2012 International Conference for, pp. 1\u201311 (2012)"},{"key":"2938_CR37","doi-asserted-by":"crossref","unstructured":"Williams, S., Shalf, J., Oliker, L., Kamil, S., Husbands, P., Yelick, K.: The potential of the cell processor for scientific computing. In: Proceedings of the 3rd Conference on Computing Frontiers, CF \u201906, pp. 9\u201320. ACM, New York (2006)","DOI":"10.1145\/1128022.1128027"},{"key":"2938_CR38","doi-asserted-by":"crossref","unstructured":"Yang, C., Xue, W., Fu, H., You, H., Wang, X., Ao, Y., Liu, F., Gan, L., Xu, P., Wang, L., Yang, G., Zheng, W.: 10M-core scalable fully-implicit solver for nonhydrostatic atmospheric dynamics. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC \u201916, pp. 6:1\u20136:12. IEEE Press, Piscataway (2016)","DOI":"10.1109\/SC.2016.5"},{"key":"2938_CR39","doi-asserted-by":"crossref","unstructured":"Zhang, J., Zhou, C., Wang, Y., Ju, L., Du, Q., Chi, X., Xu, D., Chen, D., Liu, Y., Liu, Z.: Extreme-scale phase field simulations of coarsening dynamics on the sunway TaihuLight supercomputer. In: SC16: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 34\u201345 (2016)","DOI":"10.1109\/SC.2016.3"},{"key":"2938_CR40","unstructured":"Zhang, Y., Mueller, F.: Auto-generation and auto-tuning of 3D stencil codes on GPU clusters. In: 10th Annual IEEE\/ACM International Symposium on Code Generation and Optimization, CGO 2012, San Jose, March 31\u2013 April 04, 2012, pp. 155\u2013164 (2012)"}],"container-title":["Cluster Computing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-019-02938-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10586-019-02938-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-019-02938-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,7,18]],"date-time":"2020-07-18T18:10:57Z","timestamp":1595095857000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10586-019-02938-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,5,13]]},"references-count":40,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2020,6]]}},"alternative-id":["2938"],"URL":"https:\/\/doi.org\/10.1007\/s10586-019-02938-w","relation":{},"ISSN":["1386-7857","1573-7543"],"issn-type":[{"type":"print","value":"1386-7857"},{"type":"electronic","value":"1573-7543"}],"subject":[],"published":{"date-parts":[[2019,5,13]]},"assertion":[{"value":"31 March 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 April 2019","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 May 2019","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 May 2019","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}