{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,7]],"date-time":"2025-10-07T11:49:02Z","timestamp":1759837742953,"version":"3.37.3"},"reference-count":55,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2016,10,1]],"date-time":"2016-10-01T00:00:00Z","timestamp":1475280000000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001659","name":"Deutsche Forschungsgemeinschaft","doi-asserted-by":"publisher","award":["Priority Program 1648 \u201cSoftware for Exascale Computing\u201d, Project \u201cESSEX\u201d"],"award-info":[{"award-number":["Priority Program 1648 \u201cSoftware for Exascale Computing\u201d, Project \u201cESSEX\u201d"]}],"id":[{"id":"10.13039\/501100001659","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2017,10]]},"DOI":"10.1007\/s10766-016-0464-z","type":"journal-article","created":{"date-parts":[[2016,10,1]],"date-time":"2016-10-01T10:24:09Z","timestamp":1475317449000},"page":"1046-1072","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":21,"title":["GHOST: Building Blocks for High Performance Sparse Linear Algebra on Heterogeneous Systems"],"prefix":"10.1007","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7822-9468","authenticated-orcid":false,"given":"Moritz","family":"Kreutzer","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jonas","family":"Thies","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Melven","family":"R\u00f6hrig-Z\u00f6llner","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Andreas","family":"Pieper","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Faisal","family":"Shahzad","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Martin","family":"Galgon","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Achim","family":"Basermann","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Holger","family":"Fehske","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Georg","family":"Hager","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gerhard","family":"Wellein","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2016,10,1]]},"reference":[{"key":"464_CR1","doi-asserted-by":"publisher","unstructured":"Alvermann, A., Basermann, A., Fehske, H., Galgon, M., Hager, G., Kreutzer, M., Kr\u00e4mer, L., Lang, B., Pieper, A., R\u00f6hrig-Z\u00f6llner, M., Shahzad, F., Thies, J., Wellein, G.: ESSEX: Equipping Sparse Solvers for Exascale, pp. 577\u2013588. Springer International Publishing, Cham (2014). doi:\n                        10.1007\/978-3-319-14313-2_49","DOI":"10.1007\/978-3-319-14313-2_49"},{"key":"464_CR2","doi-asserted-by":"publisher","unstructured":"Anderson, M., Ballard, G., Demmel, J., Keutzer, K.: Communication-avoiding QR decomposition for GPUs. In: IEEE International on Parallel Distributed Processing Symposium (IPDPS), 2011, pp. 48\u201358 (2011). doi:\n                        10.1109\/IPDPS.2011.15","DOI":"10.1109\/IPDPS.2011.15"},{"issue":"2","key":"464_CR3","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1002\/cpe.1631","volume":"23","author":"C Augonnet","year":"2011","unstructured":"Augonnet, C., Thibault, S., Namyst, R., Wacrenier, P.A.: StarPU: a unified platform for task scheduling on heterogeneous multicore architectures. Concurr. Comput.: Pract. Exp. 23(2), 187\u2013198 (2011). doi:\n                        10.1002\/cpe.1631","journal-title":"Concurr. Comput.: Pract. Exp."},{"issue":"2","key":"464_CR4","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1155\/2012\/693861","volume":"20","author":"CG Baker","year":"2012","unstructured":"Baker, C.G., Heroux, M.A.: Tpetra, and the use of generic programming in scientific computing. Sci. Program. 20(2), 115\u2013128 (2012). doi:\n                        10.1155\/2012\/693861","journal-title":"Sci. Program."},{"key":"464_CR5","doi-asserted-by":"publisher","unstructured":"Baker, C.G., Hetmaniuk, U.L., Lehoucq, R.B., Thornquist, H.K.: Anasazi software for the numerical solution of large-scale eigenvalue problems. ACM Trans. Math. Softw. 36(3), 13:1\u201313:23 (2009). doi:\n                        10.1145\/1527286.1527287","DOI":"10.1145\/1527286.1527287"},{"key":"464_CR6","unstructured":"Balay, S., Abhyankar, S., Adams, M.F., Brown, J., Brune, P., Buschelman, K., Dalcin, L., Eijkhout, V., Gropp, W.D., Kaushik, D., Knepley, M.G., McInnes, L.C., Rupp, K., Smith, B.F., Zampini, S., Zhang, H.: PETSc Web page (2016). \n                        http:\/\/www.mcs.anl.gov\/petsc"},{"key":"464_CR7","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1145\/567806.567807","volume":"28","author":"LS Blackford","year":"2001","unstructured":"Blackford, L.S., Demmel, J., Dongarra, J., Duff, I., Hammarling, S., Henry, G., Heroux, M., Kaufman, L., Lumsdaine, A., Petitet, A., Pozo, R., Remington, K., Whaley, R.C.: An updated set of basic linear algebra subprograms (BLAS). ACM Trans. Math. Softw. 28, 135\u2013151 (2001). doi:\n                        10.1145\/567806.567807","journal-title":"ACM Trans. Math. Softw."},{"key":"464_CR8","doi-asserted-by":"crossref","unstructured":"Boisvert, R.F., Pozo, R., Remington, K., Barrett, R.F., Dongarra, J.J.: Matrix market: A web resource for test matrix collections. In: Proceedings of the IFIP TC2\/WG2.5 Working Conference on Quality of Numerical Software: Assessment and Enhancement, pp. 125\u2013137. Chapman & Hall, Ltd., London, UK (1997)","DOI":"10.1007\/978-1-5041-2940-4_9"},{"key":"464_CR9","doi-asserted-by":"publisher","unstructured":"Broquedis, F., Clet-Ortega, J., Moreaud, S., Furmento, N., Goglin, B., Mercier, G., Thibault, S., Namyst, R.: Hwloc: A generic framework for managing hardware affinities in HPC applications. In: Proceedings of the 2010 18th Euromicro Conference on Parallel, Distributed and Network-Based Processing, PDP \u201910, pp. 180\u2013186. IEEE Computer Society, Washington, DC, USA (2010). doi:\n                        10.1109\/PDP.2010.67","DOI":"10.1109\/PDP.2010.67"},{"issue":"6\u20138","key":"464_CR10","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1016\/j.parco.2007.12.001","volume":"34","author":"C Chevalier","year":"2008","unstructured":"Chevalier, C., Pellegrini, F.: PT-Scotch: a tool for efficient parallel graph ordering. Parallel Comput. 34(6\u20138), 318\u2013331 (2008). doi:\n                        10.1016\/j.parco.2007.12.001","journal-title":"Parallel Comput."},{"issue":"2","key":"464_CR11","doi-asserted-by":"publisher","first-page":"169","DOI":"10.1137\/140968896","volume":"37","author":"E Chow","year":"2015","unstructured":"Chow, E., Patel, A.: Fine-grained parallel incomplete factorization. SIAM J. Sci. Comput. 37(2), 169\u2013193 (2015). doi:\n                        10.1137\/140968896","journal-title":"SIAM J. Sci. Comput."},{"key":"464_CR12","doi-asserted-by":"publisher","unstructured":"Davis, T.A., Hu, Y.: The university of florida sparse matrix collection. ACM Trans. Math. Softw. 38(1), Art. No. 1 (2011). doi:\n                        10.1145\/2049662.2049663","DOI":"10.1145\/2049662.2049663"},{"key":"464_CR13","doi-asserted-by":"publisher","unstructured":"Demmel, J., Hoemmen, M., Mohiyuddin, M., Yelick, K.: Avoiding communication in sparse matrix computations. In: IEEE International Symposium on Parallel and Distributed Processing, 2008. IPDPS 2008, pp. 1\u201312 (2008). doi:\n                        10.1109\/IPDPS.2008.4536305","DOI":"10.1109\/IPDPS.2008.4536305"},{"key":"464_CR14","doi-asserted-by":"publisher","unstructured":"Denis, A.: POSTER: a generic framework for asynchronous progression and multithreaded communications. In: IEEE International Conference on Cluster Computing (CLUSTER), 2014, pp. 276\u2013277 (2014). doi:\n                        10.1109\/CLUSTER.2014.6968752","DOI":"10.1109\/CLUSTER.2014.6968752"},{"key":"464_CR15","doi-asserted-by":"publisher","unstructured":"Devine, K., Boman, E., Heaphy, R., Bisseling, R., Catalyurek, U.: Parallel hypergraph partitioning for scientific computing. In: Parallel and Distributed Processing Symposium, 2006. IPDPS 2006, 20th International, p. 10 (2006). doi:\n                        10.1109\/IPDPS.2006.1639359","DOI":"10.1109\/IPDPS.2006.1639359"},{"key":"464_CR16","doi-asserted-by":"publisher","first-page":"153","DOI":"10.1016\/j.parco.2015.06.005","volume":"49","author":"M Galgon","year":"2015","unstructured":"Galgon, M., Kr\u00e4mer, L., Thies, J., Basermann, A., Lang, B.: On the parallel iterative solution of linear systems arising in the FEAST algorithm for computing inner eigenvalues. Parallel Comput. 49, 153\u2013163 (2015). doi:\n                        10.1016\/j.parco.2015.06.005","journal-title":"Parallel Comput."},{"key":"464_CR17","doi-asserted-by":"publisher","unstructured":"Gebremedhin, A.H., Nguyen, D., Patwary, M.M.A., Pothen, A.: Colpack: software for graph coloring and related problems in scientific computing. ACM Trans. Math. Softw. 40(1), 1:1\u20131:31 (2013). doi:\n                        10.1145\/2513109.2513110","DOI":"10.1145\/2513109.2513110"},{"key":"464_CR18","unstructured":"GHOST: General, Hybrid, and Optimized Sparse Toolkit. \n                        https:\/\/bitbucket.org\/essex\/ghost\n                        \n                    . Accessed July 2016"},{"issue":"1","key":"464_CR19","doi-asserted-by":"publisher","first-page":"C48","DOI":"10.1137\/12086563X","volume":"35","author":"P Ghysels","year":"2013","unstructured":"Ghysels, P., Ashby, T.J., Meerbergen, K., Vanroose, W.: Hiding global communication latency in the GMRES algorithm on massively parallel machines. SIAM J. Sci. Comput. 35(1), C48\u2013C71 (2013). doi:\n                        10.1137\/12086563X","journal-title":"SIAM J. Sci. Comput."},{"key":"464_CR20","unstructured":"Gropp, W.D., Kaushik, D.K., Keyes, D.E., Smith, B.F.: Towards realistic performance bounds for implicit CFD codes. In: Proceedings of Parallel CFD99, pp. 233\u2013240. Elsevier (1999)"},{"key":"464_CR21","doi-asserted-by":"crossref","DOI":"10.1201\/EBK1439811924","volume-title":"Introduction to High Performance Computing for Scientists and Engineers","author":"G Hager","year":"2010","unstructured":"Hager, G., Wellein, G.: Introduction to High Performance Computing for Scientists and Engineers, 1st edn. CRC Press Inc, Boca Raton, FL (2010)","edition":"1"},{"key":"464_CR22","unstructured":"Intel Math Kernel Library. \n                        https:\/\/software.intel.com\/en-us\/intel-mkl\n                        \n                    . Accessed July 2016"},{"key":"464_CR23","first-page":"355","volume":"35","author":"S Kaczmarz","year":"1937","unstructured":"Kaczmarz, S.: Angen\u00e4herte aufl\u00f6sung von systemen linearer gleichungen. Bull. Int. Acad. Pol. Sci. Lett. 35, 355\u2013357 (1937)","journal-title":"Bull. Int. Acad. Pol. Sci. Lett."},{"issue":"5","key":"464_CR24","doi-asserted-by":"publisher","first-page":"C401","DOI":"10.1137\/130930352","volume":"36","author":"M Kreutzer","year":"2014","unstructured":"Kreutzer, M., Hager, G., Wellein, G., Fehske, H., Bishop, A.R.: A unified sparse matrix data format for efficient general sparse matrix-vector multiplication on modern processors with wide SIMD units. SIAM J. Sci. Comput. 36(5), C401\u2013C423 (2014). doi:\n                        10.1137\/130930352","journal-title":"SIAM J. Sci. Comput."},{"key":"464_CR25","doi-asserted-by":"publisher","unstructured":"Kreutzer, M., Pieper, A., Hager, G., Wellein, G., Alvermann, A., Fehske, H.: Performance engineering of the kernel polynomal method on large-scale cpu-gpu systems. In: Parallel and Distributed Processing Symposium (IPDPS), 2015 IEEE International, pp. 417\u2013426 (2015). doi:\n                        10.1109\/IPDPS.2015.76","DOI":"10.1109\/IPDPS.2015.76"},{"key":"464_CR26","unstructured":"LAMA: Library for accelerated mathematical applications. \n                        http:\/\/www.libama.org\n                        \n                    . Accessed July 2016"},{"key":"464_CR27","doi-asserted-by":"publisher","DOI":"10.1137\/1.9780898719628","author":"R Lehoucq","year":"1998","unstructured":"Lehoucq, R., Sorensen, D., Yang, C.: ARPACK users\u2019 guide. Soc. Ind. Appl. Math. (1998). doi:\n                        10.1137\/1.9780898719628","journal-title":"Soc. Ind. Appl. Math."},{"key":"464_CR28","unstructured":"MAGMA: Matrix algebra on GPU and multicore architectures. \n                        http:\/\/icl.cs.utk.edu\/magma\/\n                        \n                    . Accessed July 2016"},{"key":"464_CR29","unstructured":"Matrix Market Exchange Format. \n                        http:\/\/math.nist.gov\/MatrixMarket\/formats.html#MMformat\n                        \n                    . Accessed July 2016"},{"key":"464_CR30","unstructured":"McCalpin, J.D.: Memory bandwidth and machine balance in current high performance computers. IEEE Computer Society Technical Committee on Computer Architecture (TCCA) Newsletter pp. 19\u201325 (1995)"},{"key":"464_CR31","doi-asserted-by":"publisher","unstructured":"Monakov, A., Lokhmotov, A., Avetisyan, A.: Automatically tuning sparse matrix-vector multiplication for GPU architectures. In: Y.\u00a0Patt, P.\u00a0Foglia, E.\u00a0Duesterwald, P.\u00a0Faraboschi, X.\u00a0Martorell (eds.) High Performance Embedded Architectures and Compilers, Lecture Notes in Computer Science, vol. 5952, pp. 111\u2013125. Springer, Berlin (2010). doi:\n                        10.1007\/978-3-642-11515-8_10","DOI":"10.1007\/978-3-642-11515-8_10"},{"key":"464_CR32","doi-asserted-by":"publisher","unstructured":"Nelson, T., Belter, G., Siek, J.G., Jessup, E., Norris, B.: Reliable generation of high-performance matrix algebra. ACM Trans. Math. Softw. 41(3), 18:1\u201318:27 (2015). doi:\n                        10.1145\/2629698","DOI":"10.1145\/2629698"},{"key":"464_CR33","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/0024-3795(80)90247-5","volume":"29","author":"DP O\u2019Leary","year":"1980","unstructured":"O\u2019Leary, D.P.: The block conjugate gradient algorithm and related methods. Linear Algebra Appl. 29, 293\u2013322 (1980). doi:\n                        10.1016\/0024-3795(80)90247-5\n                        \n                    . (Special Volume Dedicated to Alson S. Householder)","journal-title":"Linear Algebra Appl."},{"issue":"1","key":"464_CR34","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1002\/cnm.1630030106","volume":"3","author":"TC Oppe","year":"1987","unstructured":"Oppe, T.C., Kincaid, D.R.: The performance of ITPACK on vector computers for solving large sparse linear systems arising in sample oil reseervoir simulation problems. Commun. Appl. Numer. Methods 3(1), 23\u201329 (1987). doi:\n                        10.1002\/cnm.1630030106","journal-title":"Commun. Appl. Numer. Methods"},{"key":"464_CR35","unstructured":"PARALUTION. \n                        http:\/\/www.paralution.com\n                        \n                    . Accessed July 2016"},{"key":"464_CR36","unstructured":"PHIST: Pipelined Hybrid-parallel Iterative Solver Toolkit. \n                        https:\/\/bitbucket.org\/essex\/phist\n                        \n                    . Accessed July 2016"},{"key":"464_CR37","doi-asserted-by":"publisher","first-page":"165121","DOI":"10.1103\/PhysRevB.89.165121","volume":"89","author":"A Pieper","year":"2014","unstructured":"Pieper, A., Heinisch, R.L., Wellein, G., Fehske, H.: Dot-bound and dispersive states in graphene quantum dot superlattices. Phys. Rev. B 89, 165121 (2014). doi:\n                        10.1103\/PhysRevB.89.165121","journal-title":"Phys. Rev. B"},{"key":"464_CR38","doi-asserted-by":"publisher","first-page":"226","DOI":"10.1016\/j.jcp.2016.08.027","volume":"325","author":"A Pieper","year":"2016","unstructured":"Pieper, A., Kreutzer, M., Alvermann, A., Galgon, M., Fehske, H., Hager, G., Lang, B., Wellein, G.: High-performance implementation of Chebyshev filter diagonalization for interior eigenvalue computations. J. Comput. Phys. 325, 226\u2013243 (2016). doi:\n                        10.1016\/j.jcp.2016.08.027","journal-title":"J. Comput. Phys."},{"key":"464_CR39","doi-asserted-by":"publisher","first-page":"115112","DOI":"10.1103\/PhysRevB.79.115112","volume":"79","author":"E Polizzi","year":"2009","unstructured":"Polizzi, E.: Density-matrix-based algorithm for solving eigenvalue problems. Phys. Rev. B 79, 115112 (2009). doi:\n                        10.1103\/PhysRevB.79.115112","journal-title":"Phys. Rev. B"},{"key":"464_CR40","doi-asserted-by":"publisher","unstructured":"Rabenseifner, R., Hager, G., Jost, G.: Hybrid mpi\/openmp parallel programming on clusters of multi-core smp nodes. In: 17th Euromicro International Conference Parallel, Distributed and Network-based Processing, 2009, pp. 427\u2013436 (2009). doi:\n                        10.1109\/PDP.2009.43","DOI":"10.1109\/PDP.2009.43"},{"issue":"6","key":"464_CR41","doi-asserted-by":"publisher","first-page":"C697","DOI":"10.1137\/140976017","volume":"37","author":"M R\u00f6hrig-Z\u00f6llner","year":"2015","unstructured":"R\u00f6hrig-Z\u00f6llner, M., Thies, J., Kreutzer, M., Alvermann, A., Pieper, A., Basermann, A., Hager, G., Wellein, G., Fehske, H.: Increasing the performance of the Jacobi\u2013Davidson method by blocking. SIAM J. Sci. Comput. 37(6), C697\u2013C722 (2015). doi:\n                        10.1137\/140976017","journal-title":"SIAM J. Sci. Comput."},{"key":"464_CR42","unstructured":"Rupp, K., Rudolf, F., Weinbub, J.: ViennaCL\u2013A High Level Linear Algebra Library for GPUs and Multi-Core CPUs. In: International Workshop on GPUs and Scientific Applications, pp. 51\u201356 (2010)"},{"key":"464_CR43","doi-asserted-by":"publisher","unstructured":"Rupp, K., Weinbub, J., J\u00fcngel, A., Grasser, T.: Pipelined iterative solvers with kernel fusion for graphics processing units. ACM Trans. Math. Softw. 43(2), 11:1\u201311:27 (2016). doi:\n                        10.1145\/2907944","DOI":"10.1145\/2907944"},{"issue":"3","key":"464_CR44","doi-asserted-by":"publisher","first-page":"497","DOI":"10.1016\/j.cpc.2011.11.005","volume":"183","author":"G Schofield","year":"2012","unstructured":"Schofield, G., Chelikowsky, J.R., Saad, Y.: A spectrum slicing method for the Kohn\u2013Sham problem. Comput. Phys. Commun. 183(3), 497\u2013505 (2012). doi:\n                        10.1016\/j.cpc.2011.11.005","journal-title":"Comput. Phys. Commun."},{"key":"464_CR45","doi-asserted-by":"publisher","first-page":"201105","DOI":"10.1103\/PhysRevB.85.201105","volume":"85","author":"G Schubert","year":"2012","unstructured":"Schubert, G., Fehske, H., Fritz, L., Vojta, M.: Fate of topological-insulator surface states under strong disorder. Phys. Rev. B 85, 201105 (2012). doi:\n                        10.1103\/PhysRevB.85.201105","journal-title":"Phys. Rev. B"},{"key":"464_CR46","doi-asserted-by":"publisher","unstructured":"Siek, J., Karlin, I., Jessup, E.: Build to order linear algebra kernels. In: IEEE International Symposium on Parallel and Distributed Processing, 2008. IPDPS 2008, pp. 1\u20138 (2008). doi:\n                        10.1109\/IPDPS.2008.4536183","DOI":"10.1109\/IPDPS.2008.4536183"},{"key":"464_CR47","unstructured":"SpMP: Sparse matrix pre-processing library. \n                        https:\/\/github.com\/IntelLabs\/SpMP\n                        \n                    . Accessed July 2016"},{"issue":"2","key":"464_CR48","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1731022.1731031","volume":"37","author":"A Stathopoulos","year":"2010","unstructured":"Stathopoulos, A., McCombs, J.R.: PRIMME: preconditioned iterative multimethod eigensolver-methods and software description. ACM Trans. Math. Softw. 37(2), 1\u201330 (2010). doi:\n                        10.1145\/1731022.1731031","journal-title":"ACM Trans. Math. Softw."},{"issue":"3","key":"464_CR49","doi-asserted-by":"publisher","first-page":"601","DOI":"10.1137\/S0895479800371529","volume":"23","author":"GW Stewart","year":"2002","unstructured":"Stewart, G.W.: A Krylov-Schur algorithm for large eigenproblems. SIAM J. Matrix Anal. Appl. 23(3), 601\u2013614 (2002). doi:\n                        10.1137\/S0895479800371529","journal-title":"SIAM J. Matrix Anal. Appl."},{"issue":"2","key":"464_CR50","doi-asserted-by":"publisher","first-page":"577","DOI":"10.1007\/s11227-014-1102-4","volume":"70","author":"S Tabik","year":"2014","unstructured":"Tabik, S., Ortega, G., Garzn, E.: Performance evaluation of kernel fusion BLAS routines on the GPU: iterative solvers as case study. J. Supercomput. 70(2), 577\u2013587 (2014). doi:\n                        10.1007\/s11227-014-1102-4","journal-title":"J. Supercomput."},{"key":"464_CR51","unstructured":"TOP500 Supercomputer Sites as of June 2016. \n                        http:\/\/www.top500.org\n                        \n                    . Accessed July 2016"},{"key":"464_CR52","unstructured":"Vital, B.: Etude de quelques mthodes de rsolution de problmes linaires de grande taille sur multiprocessor. Ph.D. thesis, Universit de Rennes, Rennes (1990)"},{"key":"464_CR53","doi-asserted-by":"publisher","unstructured":"Wahib, M., Maruyama, N.: Scalable kernel fusion for memory-bound GPU applications. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC \u201914, pp. 191\u2013202. IEEE Press, Piscataway, NJ, USA (2014). doi:\n                        10.1109\/SC.2014.21","DOI":"10.1109\/SC.2014.21"},{"issue":"4","key":"464_CR54","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1145\/1498765.1498785","volume":"52","author":"S Williams","year":"2009","unstructured":"Williams, S., Waterman, A., Patterson, D.: Roofline: an insightful visual performance model for multicore architectures. Commun. ACM 52(4), 65\u201376 (2009). doi:\n                        10.1145\/1498765.1498785","journal-title":"Commun. ACM"},{"key":"464_CR55","unstructured":"Wittmann, M., Hager, G., Zeiser, T., Wellein, G.: Asynchronous MPI for the masses (2013). \n                        http:\/\/arxiv.org\/abs\/1302.4280\n                        \n                    . Preprint"}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10766-016-0464-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-016-0464-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-016-0464-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2017,8,6]],"date-time":"2017-08-06T22:33:35Z","timestamp":1502058815000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10766-016-0464-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,10,1]]},"references-count":55,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2017,10]]}},"alternative-id":["464"],"URL":"https:\/\/doi.org\/10.1007\/s10766-016-0464-z","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"type":"print","value":"0885-7458"},{"type":"electronic","value":"1573-7640"}],"subject":[],"published":{"date-parts":[[2016,10,1]]}}}