{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:40:09Z","timestamp":1740123609153,"version":"3.37.3"},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2019,9,5]],"date-time":"2019-09-05T00:00:00Z","timestamp":1567641600000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,9,5]],"date-time":"2019-09-05T00:00:00Z","timestamp":1567641600000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2019,12]]},"DOI":"10.1007\/s11227-019-02983-7","type":"journal-article","created":{"date-parts":[[2019,9,6]],"date-time":"2019-09-06T16:33:32Z","timestamp":1567787612000},"page":"8115-8146","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Implementation and performance evaluation of a communication-avoiding GMRES method for stencil-based code on GPU cluster"],"prefix":"10.1007","volume":"75","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5858-1598","authenticated-orcid":false,"given":"Kazuya","family":"Matsumoto","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yasuhiro","family":"Idomura","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Takuya","family":"Ina","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Akie","family":"Mayumi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Susumu","family":"Yamada","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,9,5]]},"reference":[{"key":"2983_CR1","doi-asserted-by":"crossref","unstructured":"Abdelfattah A, Haidar A, Tomov S, Dongarra J (2016) Performance, design, and autotuning of batched GEMM for GPUs. In: Proceedings of the ISC High Performance Computing 2016, LNCS, vol 9697, pp 21\u201338. Springer","DOI":"10.1007\/978-3-319-41321-1_2"},{"issue":"7","key":"2983_CR2","doi-asserted-by":"publisher","first-page":"1974","DOI":"10.1109\/TPDS.2016.2633349","volume":"28","author":"Y Asahi","year":"2017","unstructured":"Asahi Y, Latu G, Ina T, Idomura Y, Grandgirard V, Garbet X (2017) Optimization of fusion kernels on accelerators with indirect or strided memory access patterns. IEEE Trans Parallel Distrib Syst 28(7):1974\u20131988. https:\/\/doi.org\/10.1109\/TPDS.2016.2633349","journal-title":"IEEE Trans Parallel Distrib Syst"},{"issue":"4","key":"2983_CR3","doi-asserted-by":"publisher","first-page":"563","DOI":"10.1093\/imanum\/14.4.563","volume":"14","author":"Z Bai","year":"1994","unstructured":"Bai Z, Hu D, Reichel L (1994) A Newton basis GMRES implementation. IMA J Numer Anal 14(4):563\u2013581. https:\/\/doi.org\/10.1093\/imanum\/14.4.563","journal-title":"IMA J Numer Anal"},{"key":"2983_CR4","unstructured":"Carson E (2015) Communication-avoiding Krylov subspace methods in theory and practice. PhD dissertation, University of California, Berkeley"},{"issue":"2","key":"2983_CR5","doi-asserted-by":"publisher","first-page":"153","DOI":"10.1016\/0377-0427(89)90045-9","volume":"25","author":"AT Chronopoulos","year":"1989","unstructured":"Chronopoulos AT, Gear CW (1989) s-Step iterative methods for symmetric linear systems. J Comput Appl Math 25(2):153\u2013168. https:\/\/doi.org\/10.1016\/0377-0427(89)90045-9","journal-title":"J Comput Appl Math"},{"key":"2983_CR6","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1007\/978-3-642-85972-4_4","volume-title":"Lecture Notes in Economics and Mathematical Systems","author":"Paul Concus","year":"1976","unstructured":"Concus P, Golub GH (1976) A generalized conjugate gradient method for nonsymmetric systems of linear equations. In: Computing Methods in Applied Sciences and Engineering, Lecture Notes in Economics and Mathematical Systems, vol 134. Springer, pp 56\u201365. https:\/\/doi.org\/10.1007\/978-3-642-85972-4_4"},{"key":"2983_CR7","unstructured":"Cumming B (November 2018) STREAM benchmark in CUDA C++. https:\/\/github.com\/bcumming\/cuda-stream . Accessed 5"},{"issue":"1","key":"2983_CR8","doi-asserted-by":"publisher","first-page":"A206","DOI":"10.1137\/080731992","volume":"34","author":"J Demmel","year":"2012","unstructured":"Demmel J, Grigori L, Hoemmen M, Langou J (2012) Communication-optimal parallel and sequential QR and LU factorizations. SIAM J Sci Comput 34(1):A206\u2013A239. https:\/\/doi.org\/10.1137\/080731992","journal-title":"SIAM J Sci Comput"},{"issue":"2","key":"2983_CR9","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1137\/0720023","volume":"20","author":"SC Eisenstat","year":"1983","unstructured":"Eisenstat SC, Elman HC, Schultz MH (1983) Variational iterative methods for nonsymmetric systems of linear equations. SIAM J Numer Anal 20(2):345\u2013357. https:\/\/doi.org\/10.1137\/0720023","journal-title":"SIAM J Numer Anal"},{"key":"2983_CR10","doi-asserted-by":"publisher","unstructured":"Fujita N, Nuga H, Boku T, Idomura Y (2013) Nuclear fusion simulation code optimization on GPU clusters. In: Proceedings of the 19th IEEE International Conference on Parallel and Distributed Systems (ICPADS 2013). IEEE, pp 1266\u20131274. https:\/\/doi.org\/10.1109\/ICPADS.2013.65","DOI":"10.1109\/ICPADS.2013.65"},{"key":"2983_CR11","doi-asserted-by":"crossref","DOI":"10.56021\/9781421407944","volume-title":"Matrix computations","author":"GH Golub","year":"2013","unstructured":"Golub GH, Van Loan CF (2013) Matrix computations, 4th edn. The John Hopkins University Press, Baltimore","edition":"4"},{"key":"2983_CR12","unstructured":"Hoemmen M (2010) Communication-avoiding Krylov subspace methods. PhD dissertation, University of California, Berkeley"},{"issue":"6","key":"2983_CR13","doi-asserted-by":"publisher","first-page":"391","DOI":"10.1016\/j.cpc.2008.04.005","volume":"179","author":"Y Idomura","year":"2008","unstructured":"Idomura Y, Ida M, Kano T, Aiba N, Tokuda S (2008) Conservative global gyrokinetic toroidal full-f five-dimensional Vlasov simulation. Comput Phys Commun 179(6):391\u2013403. https:\/\/doi.org\/10.1016\/j.cpc.2008.04.005","journal-title":"Comput Phys Commun"},{"key":"2983_CR14","doi-asserted-by":"publisher","unstructured":"Idomura Y, Ina T, Mayumi A, Yamada S, Matsumoto K, Asahi Y, Imamura T (2017) Application of a communication-avoiding generalized minimal residual method to a gyrokinetic five dimensional Eulerian code on many core platforms. In: Proceedings of the 8th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA \u201917), p 7. https:\/\/doi.org\/10.1145\/3148226.3148234","DOI":"10.1145\/3148226.3148234"},{"issue":"1","key":"2983_CR15","doi-asserted-by":"publisher","first-page":"73","DOI":"10.1177\/1094342013490973","volume":"28","author":"Y Idomura","year":"2014","unstructured":"Idomura Y, Nakata M, Yamada S, Machida M, Imamura T, Watanabe T, Nunami M, Inoue H, Tsutsumi S, Miyoshi I, Shida N (2014) Communication-overlap techniques for improved strong scaling of gyrokinetic Eulerian code beyond 100k cores on the K-computer. Int J High Perform Comput Appl 28(1):73\u201386. https:\/\/doi.org\/10.1177\/1094342013490973","journal-title":"Int J High Perform Comput Appl"},{"issue":"1\u20134","key":"2983_CR16","doi-asserted-by":"publisher","first-page":"269","DOI":"10.1080\/00207169208804107","volume":"44","author":"WD Joubert","year":"1992","unstructured":"Joubert WD, Carey GF (1992) Parallelizable restarted iterative methods for nonsymmetric linear systems. Part I: theory. Int J Comput Math 44(1\u20134):269\u2013290. https:\/\/doi.org\/10.1080\/00207169208804107","journal-title":"Int J Comput Math"},{"key":"2983_CR17","unstructured":"McCalpin JD (November 2018) STREAM: Sustainable memory bandwidth in high performance computers. http:\/\/www.cs.virginia.edu\/stream\/ . Accessed 5"},{"key":"2983_CR18","doi-asserted-by":"publisher","unstructured":"Mohiyuddin M, Hoemmen M, Demmel J, Yelick K (2009) Minimizing communication in sparse matrix solvers. In: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis (SC \u201909). ACM. https:\/\/doi.org\/10.1145\/1654059.1654096","DOI":"10.1145\/1654059.1654096"},{"issue":"4","key":"2983_CR19","doi-asserted-by":"publisher","first-page":"511","DOI":"10.1177\/1094342010385729","volume":"24","author":"R Nath","year":"2010","unstructured":"Nath R, Tomov S, Dongarra J (2010) An improved MAGMA GEMM for Fermi graphics processing units. Int J High Perform Comput Appl 24(4):511\u2013515. https:\/\/doi.org\/10.1177\/1094342010385729","journal-title":"Int J High Perform Comput Appl"},{"key":"2983_CR20","unstructured":"NVIDIA Corporation: NVIDIA GPUDirect. https:\/\/developer.nvidia.com\/gpudirect . Accessed 5 Nov 2018"},{"key":"2983_CR21","unstructured":"Rosendale JV (1983) Minimizing inner product data dependencies in conjugate gradient iteration. Technical Report NASA-CR-17, NASA"},{"key":"2983_CR22","doi-asserted-by":"publisher","DOI":"10.1137\/1.9780898718003","volume-title":"Iterative methods for sparse linear systems","author":"Y Saad","year":"2003","unstructured":"Saad Y (2003) Iterative methods for sparse linear systems, 2nd edn. SIAM, Philadelphia","edition":"2"},{"issue":"3","key":"2983_CR23","doi-asserted-by":"publisher","first-page":"856","DOI":"10.1137\/0907058","volume":"7","author":"Y Saad","year":"1986","unstructured":"Saad Y, Schultz MH (1986) GMRES: a generalized minimal residual algorithm for solving nonsymmetric linear systems. SIAM J Sci Stat Comput 7(3):856\u2013869. https:\/\/doi.org\/10.1137\/0907058","journal-title":"SIAM J Sci Stat Comput"},{"key":"2983_CR24","doi-asserted-by":"publisher","unstructured":"Shimokawabe T, Aoki T, Muroi C, Ishida J, Kawano K, Endo T, Nukada A, Maruyama N, Matsuoka S (2010) An 80-fold speedup, 15.0 TFlops GPU acceleration of non-hydrostatic weather model ASUCA production code. In: Proceedings of the 2010 ACM\/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis (SC 2010). IEEE. https:\/\/doi.org\/10.1109\/SC.2010.9","DOI":"10.1109\/SC.2010.9"},{"issue":"6","key":"2983_CR25","doi-asserted-by":"publisher","first-page":"2165","DOI":"10.1137\/S1064827500370883","volume":"23","author":"A Stathopoulos","year":"2002","unstructured":"Stathopoulos A, Wu K (2002) A block orthogonalization procedure with constant synchronization requirements. SIAM J Sci Comput 23(6):2165\u20132184. https:\/\/doi.org\/10.1137\/S1064827500370883","journal-title":"SIAM J Sci Comput"},{"issue":"4","key":"2983_CR26","doi-asserted-by":"publisher","first-page":"441","DOI":"10.1016\/0168-9274(95)00079-A","volume":"18","author":"E de Sturler","year":"1995","unstructured":"de Sturler E, van der Vorst HA (1995) Reducing the effect of global communication in GMRES(m) and CG on parallel distributed memory computers. Appl Numer Math 18(4):441\u2013459. https:\/\/doi.org\/10.1016\/0168-9274(95)00079-A","journal-title":"Appl Numer Math"},{"issue":"1","key":"2983_CR27","doi-asserted-by":"publisher","first-page":"152","DOI":"10.1137\/0909010","volume":"9","author":"HF Walker","year":"1988","unstructured":"Walker HF (1988) Implementation of the GMRES method using householder transformations. SIAM J Sci Stat Comput 9(1):152\u2013163. https:\/\/doi.org\/10.1137\/0909010","journal-title":"SIAM J Sci Stat Comput"},{"key":"2983_CR28","first-page":"195","volume-title":"Performance tuning of scientific applications, chapter 9","author":"SW Williams","year":"2011","unstructured":"Williams SW (2011) The roofline model. In: Bailey DH, Lucas RF, Williams SW (eds) Performance tuning of scientific applications, chapter 9. CRC Press, Boca Raton, pp 195\u2013215"},{"key":"2983_CR29","doi-asserted-by":"publisher","unstructured":"Yamazaki I, Anzt H, Tomov S, Hoemmen M, Dongarra J (2014) Improving the performance of CA-GMRES on multicores with multiple GPUs. IEEE, pp 382\u2013391. https:\/\/doi.org\/10.1109\/IPDPS.2014.48","DOI":"10.1109\/IPDPS.2014.48"},{"key":"2983_CR30","doi-asserted-by":"publisher","unstructured":"Yamazaki I, Hoemmen M, Luszczek P, Dongarra J (2017) Improving performance of GMRES by reducing communication and pipelining global collectives. In: Proceedings of the 2017 IEEE 31st International Parallel and Distributed Processing Symposium Workshops (IPDPSW 2017). IEEE, pp 1118\u20131127. https:\/\/doi.org\/10.1109\/IPDPSW.2017.65","DOI":"10.1109\/IPDPSW.2017.65"},{"issue":"3","key":"2983_CR31","doi-asserted-by":"publisher","first-page":"C307","DOI":"10.1137\/14M0973773","volume":"37","author":"I Yamazaki","year":"2015","unstructured":"Yamazaki I, Tomov S, Dongarra J (2015) Mixed-precision Cholesky QR factorization and its case studies on multicore CPU with multiple GPUs. SIAM J Sci Comput 37(3):C307\u2013C330. https:\/\/doi.org\/10.1137\/14M0973773","journal-title":"SIAM J Sci Comput"},{"issue":"2","key":"2983_CR32","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2898347","volume":"43","author":"I Yamazaki","year":"2016","unstructured":"Yamazaki I, Tomov S, Dongarra JJ (2016) Stability and performance of various singular value QR implementations on multicore CPU with a GPU. ACM Trans Math Softw 43(2):1\u201318. https:\/\/doi.org\/10.1145\/2898347","journal-title":"ACM Trans Math Softw"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-019-02983-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-019-02983-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-019-02983-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,19]],"date-time":"2023-09-19T23:38:15Z","timestamp":1695166695000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-019-02983-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,9,5]]},"references-count":32,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2019,12]]}},"alternative-id":["2983"],"URL":"https:\/\/doi.org\/10.1007\/s11227-019-02983-7","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"type":"print","value":"0920-8542"},{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2019,9,5]]},"assertion":[{"value":"5 September 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}