{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,9]],"date-time":"2024-09-09T06:53:00Z","timestamp":1725864780899},"publisher-location":"Cham","reference-count":36,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319405261"},{"type":"electronic","value":"9783319405285"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-40528-5_14","type":"book-chapter","created":{"date-parts":[[2016,9,14]],"date-time":"2016-09-14T06:50:15Z","timestamp":1473835815000},"page":"317-338","source":"Crossref","is-referenced-by-count":5,"title":["Performance Engineering and Energy Efficiency of Building Blocks for Large, Sparse Eigenvalue Computations on Heterogeneous Supercomputers"],"prefix":"10.1007","author":[{"given":"Moritz","family":"Kreutzer","sequence":"first","affiliation":[]},{"given":"Jonas","family":"Thies","sequence":"additional","affiliation":[]},{"given":"Andreas","family":"Pieper","sequence":"additional","affiliation":[]},{"given":"Andreas","family":"Alvermann","sequence":"additional","affiliation":[]},{"given":"Martin","family":"Galgon","sequence":"additional","affiliation":[]},{"given":"Melven","family":"R\u00f6hrig-Z\u00f6llner","sequence":"additional","affiliation":[]},{"given":"Faisal","family":"Shahzad","sequence":"additional","affiliation":[]},{"given":"Achim","family":"Basermann","sequence":"additional","affiliation":[]},{"given":"Alan R.","family":"Bishop","sequence":"additional","affiliation":[]},{"given":"Holger","family":"Fehske","sequence":"additional","affiliation":[]},{"given":"Georg","family":"Hager","sequence":"additional","affiliation":[]},{"given":"Bruno","family":"Lang","sequence":"additional","affiliation":[]},{"given":"Gerhard","family":"Wellein","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,9,15]]},"reference":[{"key":"14_CR1","doi-asserted-by":"crossref","unstructured":"Ashari, A., Sedaghati, N., Eisenlohr, J., Parthasarathy, S., Sadayappan, P.: Fast sparse matrix-vector multiplication on GPUs for graph applications. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC \u201914), pp.\u00a0781\u2013792. IEEE Press, Piscataway (2014)","DOI":"10.1109\/SC.2014.69"},{"key":"14_CR2","doi-asserted-by":"crossref","unstructured":"Baker, C.G., Hetmaniuk, U.L., Lehoucq, R.B., Thornquist, H.K.: Anasazi software for the numerical solution of large-scale eigenvalue problems. ACM Trans. Math. Softw. 36 (3), 13:1\u201313:23 (2009)","DOI":"10.1145\/1527286.1527287"},{"key":"14_CR3","volume-title":"PETSc Web page","author":"S. Balay","year":"2015","unstructured":"Balay, S., Abhyankar, S., Adams, M.F., Brown, J., Brune, P., Buschelman, K., Dalcin, L., Eijkhout, V., Gropp, W.D., Kaushik, D., Knepley, M.G., McInnes, L.C., Rupp, K., Smith, B.F., Zampini, S., Zhang, H.: PETSc Web page (2015). http:\/\/www.mcs.anl.gov\/petsc"},{"issue":"4","key":"14_CR4","doi-asserted-by":"crossref","first-page":"334","DOI":"10.1016\/0743-7315(88)90002-0","volume":"5","author":"D. Callahan","year":"1988","unstructured":"Callahan, D., Cocke, J., Kennedy, K.: Estimating interlock and improving balance for pipelined architectures. J. Parallel Distrib. Commun. 5 (4), 334\u2013358 (1988)","journal-title":"J. Parallel Distrib. Commun."},{"key":"14_CR5","doi-asserted-by":"crossref","unstructured":"Daga, M., Greathouse, J.L.: Structural agnostic spmv: Adapting csr-adaptive for irregular matrices. In: 2015 IEEE 22nd International Conference on High Performance Computing (HiPC), pp.\u00a064\u201374 (2015)","DOI":"10.1109\/HiPC.2015.55"},{"key":"14_CR6","doi-asserted-by":"crossref","first-page":"793","DOI":"10.1007\/978-3-642-55224-3_74","volume-title":"Parallel Processing and Applied Mathematics. Lecture Notes in Computer Science","author":"K. Vogeleer De","year":"2014","unstructured":"De Vogeleer, K., Memmi, G., Jouvelot, P., Coelho, F.: The energy\/frequency convexity rule: modeling and experimental validation on mobile devices. In: Wyrzykowski, R., Dongarra, J., Karczewski, K., Wa\u015bniewski, J. (eds.) Parallel Processing and Applied Mathematics. Lecture Notes in Computer Science, vol.\u00a08384, pp.\u00a0793\u2013803. Springer, Berlin\/Heidelberg (2014)"},{"issue":"2","key":"14_CR7","doi-asserted-by":"crossref","first-page":"239","DOI":"10.1145\/567806.567810","volume":"28","author":"I.S. Duff","year":"2002","unstructured":"Duff, I.S., Heroux, M.A., Pozo, R.: An overview of the sparse basic linear algebra subprograms: the new standard from the BLAS technical forum. ACM Trans. Math. Softw. 28 (2), 239\u2013267 (2002)","journal-title":"ACM Trans. Math. Softw."},{"issue":"8","key":"14_CR8","doi-asserted-by":"crossref","first-page":"1868","DOI":"10.1002\/pssb.201552119","volume":"252","author":"H. Fehske","year":"2015","unstructured":"Fehske, H., Hager, G., Pieper, A.: Electron confinement in graphene with gate-defined quantum dots. Phys. Status Solidi 252 (8), 1868\u20131871 (2015)","journal-title":"Phys. Status Solidi"},{"key":"14_CR9","doi-asserted-by":"crossref","unstructured":"Greathouse, J.L., Daga, M.: Efficient sparse matrix-vector multiplication on GPUs using the CSR storage format. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp.\u00a0769\u2013780 (SC \u201914). IEEE Press, Piscataway (2014)","DOI":"10.1109\/SC.2014.68"},{"issue":"2","key":"14_CR10","doi-asserted-by":"crossref","first-page":"189","DOI":"10.1002\/cpe.3180","volume":"28","author":"G. Hager","year":"2014","unstructured":"Hager, G., Treibig, J., Habich, J., Wellein, G.: Exploring performance and power properties of modern multi-core chips via simple machine models. Concurr. Comput. 28 (2), 189\u2013210 (2014)","journal-title":"Concurr. Comput."},{"issue":"3","key":"14_CR11","doi-asserted-by":"crossref","first-page":"397","DOI":"10.1145\/1089014.1089021","volume":"31","author":"M.A. Heroux","year":"2005","unstructured":"Heroux, M.A., Bartlett, R.A., Howle, V.E., Hoekstra, R.J., Hu, J.J., Kolda, T.G., Lehoucq, R.B., Long, K.R., Pawlowski, R.P., Phipps, E.T., Salinger, A.G., Thornquist, H.K., Tuminaro, R.S., Willenbring, J.M., Williams, A., Stanley, K.S.: An overview of the Trilinos project. ACM Trans. Math. Softw. 31 (3), 397\u2013423 (2005)","journal-title":"ACM Trans. Math. Softw."},{"key":"14_CR12","doi-asserted-by":"crossref","unstructured":"Hockney, R.W., Curington, I.J.: f 1\u22152: A parameter to characterize memory and communication bottlenecks. Parallel Comput. 10 (3), 277\u2013286 (1989)","DOI":"10.1016\/0167-8191(89)90100-2"},{"issue":"5","key":"14_CR13","doi-asserted-by":"crossref","first-page":"C401","DOI":"10.1137\/130930352","volume":"36","author":"M. Kreutzer","year":"2014","unstructured":"Kreutzer, M., Hager, G., Wellein, G., Fehske, H., Bishop, A.R.: A unified sparse matrix data format for efficient general sparse matrix-vector multiplication on modern processors with wide SIMD units. SIAM J. Sci. Comput. 36 (5), C401\u2013C423 (2014)","journal-title":"SIAM J. Sci. Comput."},{"key":"14_CR14","unstructured":"Kreutzer, M., Pieper, A., Alvermann, A., Fehske, H., Hager, G., Wellein, G., Bishop, A.R.: Efficient large-scale sparse eigenvalue computations on heterogeneous hardware. In: Poster at 2015 ACM\/IEEE International Conference on High Performance Computing Networking, Storage and Analysis (SC \u201915) (2015)"},{"key":"14_CR15","doi-asserted-by":"crossref","unstructured":"Kreutzer, M., Pieper, A., Hager, G., Alvermann, A., Wellein, G., Fehske, H.: Performance engineering of the kernel polynomial method on large-scale CPU-GPU systems. In: 29th IEEE International Parallel & Distributed Processing Symposium (IEEE IPDPS 2015), Hyderabad (2015)","DOI":"10.1109\/IPDPS.2015.76"},{"key":"14_CR16","unstructured":"Kreutzer, M., Thies, J., R\u00f6hrig-Z\u00f6llner, M., Pieper, A., Shahzad, F., Galgon, M., Basermann, A., Fehske, H., Hager, G., Wellein, G.: GHOST: building blocks for high performance sparse linear algebra on heterogeneous systems (2015), preprint. http:\/\/arxiv.org\/abs\/1507.08101"},{"issue":"3","key":"14_CR17","doi-asserted-by":"crossref","first-page":"308","DOI":"10.1145\/355841.355847","volume":"5","author":"C.L. Lawson","year":"1979","unstructured":"Lawson, C.L., Hanson, R.J., Kincaid, D.R., Krogh, F.T.: Basic linear algebra subprograms for Fortran usage. ACM Trans. Math. Softw. 5 (3), 308\u2013323 (1979)","journal-title":"ACM Trans. Math. Softw."},{"key":"14_CR18","unstructured":"LIKWID: Performance monitoring and benchmarking suite. https:\/\/github.com\/RRZE-HPC\/likwid\/ . Accessed Feb 2016"},{"key":"14_CR19","doi-asserted-by":"crossref","unstructured":"Liu, W., Vinter, B.: CSR5: An efficient storage format for cross-platform sparse matrix-vector multiplication. In: Proceedings of the 29th ACM on International Conference on Supercomputing (ICS \u201915), pp.\u00a0339\u2013350. ACM, New York (2015)","DOI":"10.1145\/2751205.2751209"},{"key":"14_CR20","unstructured":"MAGMA: Matrix algebra on GPU and multicore architectures. http:\/\/icl.cs.utk.edu\/magma\/ . Accessed Feb 2016"},{"key":"14_CR21","doi-asserted-by":"crossref","first-page":"111","DOI":"10.1007\/978-3-642-11515-8_10","volume-title":"High Performance Embedded Architectures and Compilers. Lecture Notes in Computer Science","author":"A. Monakov","year":"2010","unstructured":"Monakov, A., Lokhmotov, A., Avetisyan, A.: Automatically tuning sparse matrix-vector multiplication for GPU architectures. In: Patt, Y., Foglia, P., Duesterwald, E., Faraboschi, P., Martorell, X. (eds.) High Performance Embedded Architectures and Compilers. Lecture Notes in Computer Science, vol.\u00a05952, pp.\u00a0111\u2013125. Springer, Berlin\/Heidelberg (2010)"},{"issue":"4","key":"14_CR22","doi-asserted-by":"crossref","first-page":"47010","DOI":"10.1209\/0295-5075\/104\/47010","volume":"104","author":"A. Pieper","year":"2013","unstructured":"Pieper, A., Heinisch, R.L., Fehske, H.: Electron dynamics in graphene with gate-defined quantum dots. EPL 104 (4), 47010 (2013)","journal-title":"EPL"},{"key":"14_CR23","doi-asserted-by":"crossref","first-page":"165121","DOI":"10.1103\/PhysRevB.89.165121","volume":"89","author":"A. Pieper","year":"2014","unstructured":"Pieper, A., Heinisch, R.L., Wellein, G., Fehske, H.: Dot-bound and dispersive states in graphene quantum dot superlattices. Phys. Rev. B 89, 165121 (2014)","journal-title":"Phys. Rev. B"},{"key":"14_CR24","unstructured":"Pieper, A., Kreutzer, M., Galgon, M., Alvermann, A., Fehske, H., Hager, G., Lang, B., Wellein, G.: High-performance implementation of Chebyshev filter diagonalization for interior eigenvalue computations (2015), preprint. http:\/\/arxiv.org\/abs\/1510.04895"},{"key":"14_CR25","doi-asserted-by":"crossref","first-page":"195409","DOI":"10.1103\/PhysRevB.88.195409","volume":"88","author":"A. Pieper","year":"2013","unstructured":"Pieper, A., Schubert, G., Wellein, G., Fehske, H.: Effects of disorder and contacts on transport through graphene nanoribbons. Phys. Rev. B 88, 195409 (2013)","journal-title":"Phys. Rev. B"},{"key":"14_CR26","doi-asserted-by":"crossref","first-page":"035123","DOI":"10.1103\/PhysRevB.93.035123","volume":"93","author":"A. Pieper","year":"2016","unstructured":"Pieper, A., Fehske, H.: Topological insulators in random potentials. Phys. Rev. B 93, 035123 (2016)","journal-title":"Phys. Rev. B"},{"key":"14_CR27","doi-asserted-by":"crossref","first-page":"115112","DOI":"10.1103\/PhysRevB.79.115112","volume":"79","author":"E. Polizzi","year":"2009","unstructured":"Polizzi, E.: Density-matrix-based algorithm for solving eigenvalue problems. Phys. Rev. B 79, 115112 (2009)","journal-title":"Phys. Rev. B"},{"key":"14_CR28","unstructured":"R\u00f6hrig-Z\u00f6llner, M., Thies, J., Kreutzer, M., Alvermann, A., Pieper, A., Basermann, A., Hager, G., Wellein, G., Fehske, H.: Performance of block Jacobi-Davidson eigensolvers. In: Poster at 2014 ACM\/IEEE International Conference on High Performance Computing Networking, Storage and Analysis (2014)"},{"issue":"6","key":"14_CR29","doi-asserted-by":"crossref","first-page":"C697","DOI":"10.1137\/140976017","volume":"37","author":"M. R\u00f6hrig-Z\u00f6llner","year":"2015","unstructured":"R\u00f6hrig-Z\u00f6llner, M., Thies, J., Kreutzer, M., Alvermann, A., Pieper, A., Basermann, A., Hager, G., Wellein, G., Fehske, H.: Increasing the performance of the Jacobi\u2013Davidson method by blocking. SIAM J. Sci. Comput. 37 (6), C697\u2013C722 (2015)","journal-title":"SIAM J. Sci. Comput."},{"key":"14_CR30","unstructured":"Rupp, K., Rudolf, F., Weinbub, J.: ViennaCL \u2013 a high level linear algebra library for GPUs and multi-core CPUs. In: International\u00a0Workshop on GPUs and Scientific Applications, pp.\u00a051\u201356 (2010)"},{"key":"14_CR31","doi-asserted-by":"crossref","unstructured":"Stengel, H., Treibig, J., Hager, G., Wellein, G.: Quantifying performance bottlenecks of stencil computations using the execution-cache-memory model. In: Proceedings of the 29th ACM International Conference on Supercomputing (ICS \u201915), pp.\u00a0207\u2013216. ACM, New York (2015)","DOI":"10.1145\/2751205.2751240"},{"key":"14_CR32","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-319-40528-5_13","volume-title":"Towards an exascale enabled sparse solver repository","author":"J. Thies","year":"2016","unstructured":"Thies, J., Galgon, M., Shahzad, F., Alvermann, A., Kreutzer, M., Pieper, A., R\u00f6hrig-Z\u00f6llner, M., Basermann, A., Fehske, H., Hager, G., Lang, B., Wellein, G.: Towards an exascale enabled sparse solver repository. In: Proceedings of SPPEXA Symposium. Lecture Notes in Computational Science and Engineering. Springer (2016)"},{"key":"14_CR33","unstructured":"TOP500 Supercomputer Sites. http:\/\/www.top500.org . Accessed Feb 2016"},{"key":"14_CR34","doi-asserted-by":"crossref","unstructured":"Treibig, J., Hager, G., Wellein, G.: LIKWID: A lightweight performance-oriented tool suite for x86 multicore environments. In: Proceedings of the 2010 39th International Conference on Parallel Processing Workshops (ICPPW \u201910), pp.\u00a0207\u2013216. IEEE Computer Society, Washington, DC (2010)","DOI":"10.1109\/ICPPW.2010.38"},{"key":"14_CR35","doi-asserted-by":"crossref","first-page":"27","DOI":"10.1007\/978-3-642-31476-6_3","volume-title":"Tools for High Performance Computing 2011","author":"J. Treibig","year":"2012","unstructured":"Treibig, J., Hager, G., Wellein, G.: likwid-bench: An extensible microbenchmarking platform for x86 multicore compute nodes. In: Brunst, H., M\u00fcller, M.S., Nagel, W.E., Resch, M.M. (eds.) Tools for High Performance Computing 2011, pp.\u00a027\u201336. Springer, Berlin\/Heidelberg (2012)"},{"key":"14_CR36","doi-asserted-by":"crossref","first-page":"275","DOI":"10.1103\/RevModPhys.78.275","volume":"78","author":"A. Wei\u00dfe","year":"2006","unstructured":"Wei\u00dfe, A., Wellein, G., Alvermann, A., Fehske, H.: The kernel polynomial method. Rev. Mod. Phys. 78, 275\u2013306 (2006)","journal-title":"Rev. Mod. Phys."}],"container-title":["Lecture Notes in Computational Science and Engineering","Software for Exascale Computing - SPPEXA 2013-2015"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-40528-5_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,9,13]],"date-time":"2019-09-13T13:12:47Z","timestamp":1568380367000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-40528-5_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319405261","9783319405285"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-40528-5_14","relation":{},"ISSN":["1439-7358","2197-7100"],"issn-type":[{"type":"print","value":"1439-7358"},{"type":"electronic","value":"2197-7100"}],"subject":[],"published":{"date-parts":[[2016]]}}}