{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T20:12:39Z","timestamp":1767989559105,"version":"3.49.0"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319969824","type":"print"},{"value":"9783319969831","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-319-96983-1_57","type":"book-chapter","created":{"date-parts":[[2018,7,31]],"date-time":"2018-07-31T15:50:06Z","timestamp":1533052206000},"page":"811-825","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Tile Low-Rank GEMM Using Batched Operations on GPUs"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9509-7794","authenticated-orcid":false,"given":"Ali","family":"Charara","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4052-7224","authenticated-orcid":false,"given":"David","family":"Keyes","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6897-1095","authenticated-orcid":false,"given":"Hatem","family":"Ltaief","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,8,1]]},"reference":[{"key":"57_CR1","unstructured":"Matrix Algebra on GPU and Multicore Architectures. Innovative Computing Laboratory, University of Tennessee. http:\/\/icl.cs.utk.edu\/magma\/"},{"key":"57_CR2","unstructured":"The NVIDIA CUDA Basic Linear Algebra Subroutines (CUBLAS). http:\/\/developer.nvidia.com\/cublas"},{"key":"57_CR3","unstructured":"Abadi, M., Agarwal, A., Barham, P., Brevdo, E., et al.: TensorFlow: large-scale machine learning on heterogeneous distributed systems. arXiv preprint arXiv:1603.04467 (2016)"},{"key":"57_CR4","doi-asserted-by":"publisher","first-page":"108","DOI":"10.1016\/j.procs.2016.05.302","volume":"80","author":"A Abdelfattah","year":"2016","unstructured":"Abdelfattah, A., et al.: High-performance tensor contractions for GPUs. Procedia Comput. Sci. 80, 108\u2013118 (2016). International Conference on Computational Science 2016, ICCS 2016, San Diego, California, USA, 6\u20138 June 2016","journal-title":"Procedia Comput. Sci."},{"key":"57_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/978-3-319-41321-1_2","volume-title":"High Performance Computing","author":"A Abdelfattah","year":"2016","unstructured":"Abdelfattah, A., Haidar, A., Tomov, S., Dongarra, J.: Performance, design, and autotuning of batched GEMM for GPUs. In: Kunkel, J.M., Balaji, P., Dongarra, J. (eds.) ISC High Performance 2016. LNCS, vol. 9697, pp. 21\u201338. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-41321-1_2"},{"issue":"12","key":"57_CR6","doi-asserted-by":"publisher","first-page":"3447","DOI":"10.1002\/cpe.3874","volume":"28","author":"A Abdelfattah","year":"2016","unstructured":"Abdelfattah, A., Ltaief, H., Keyes, D.E., Dongarra, J.J.: Performance optimization of sparse matrix-vector multiplication for multi-component PDE-based applications using GPUs. Concurr. Comput.: Pract. Exp. 28(12), 3447\u20133465 (2016)","journal-title":"Concurr. Comput.: Pract. Exp."},{"issue":"1","key":"57_CR7","first-page":"012037","volume":"180","author":"E Agullo","year":"2009","unstructured":"Agullo, E., et al.: Numerical linear algebra on emerging architectures: the PLASMA and MAGMA projects. J. Phys: Conf. Ser. 180(1), 012037 (2009)","journal-title":"J. Phys: Conf. Ser."},{"key":"57_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"22","DOI":"10.1007\/978-3-319-58667-0_2","volume-title":"High Performance Computing","author":"K Akbudak","year":"2017","unstructured":"Akbudak, K., Ltaief, H., Mikhalev, A., Keyes, D.: Tile low rank cholesky factorization for climate\/weather modeling applications on manycore architectures. In: Kunkel, J.M., Yokota, R., Balaji, P., Keyes, D. (eds.) ISC 2017. LNCS, vol. 10266, pp. 22\u201340. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-58667-0_2"},{"key":"57_CR9","doi-asserted-by":"publisher","first-page":"721","DOI":"10.1007\/978-3-319-96983-1_51","volume-title":"Euro-Par 2018: Parallel Processing","author":"Kadir Akbudak","year":"2018","unstructured":"Akbudak, K., Ltaief, H., Mikhalev, A., Charara, A., Keyes, D.: Exploiting data sparsity for large-scale matrix computations. In: Aldinucci, M., et al. (eds.) Euro-Par 2018. LNCS, vol. 11014, pp. xx\u2013yy. Springer, Cham (2018)"},{"issue":"3","key":"57_CR10","doi-asserted-by":"publisher","first-page":"477","DOI":"10.1007\/s10915-013-9714-z","volume":"57","author":"S Ambikasaran","year":"2013","unstructured":"Ambikasaran, S., Darve, E.: An $$\\mathscr {O}({N} \\log {N})$$ fast direct solver for partial hierarchically semiseparable matrices. J. Sci. Comput. 57(3), 477\u2013501 (2013)","journal-title":"J. Sci. Comput."},{"issue":"3","key":"57_CR11","doi-asserted-by":"publisher","first-page":"A1451","DOI":"10.1137\/120903476","volume":"37","author":"PR Amestoy","year":"2015","unstructured":"Amestoy, P.R., Ashcraft, C., Boiteau, O., Buttari, A., L\u2019Excellent, J.Y., Weisbecker, C.: Improving multifrontal methods by means of block low-rank representations. SIAM J. Sci. Comput. 37(3), A1451\u2013A1474 (2015). https:\/\/doi.org\/10.1137\/120903476","journal-title":"SIAM J. Sci. Comput."},{"key":"57_CR12","unstructured":"Aminfar, A., Darve, E.: A fast sparse solver for Finite-Element matrices. arXiv:1403.5337 [cs.NA], pp. 1\u201325 (2014)"},{"key":"57_CR13","doi-asserted-by":"crossref","unstructured":"B\u00f6rm, S.: Efficient numerical methods for non-local operators: $$\\mathscr {H}^2$$-Matrix compression, algorithms and analysis. EMS Tracts in Mathematics, vol. 14. European Mathematical Society (2010)","DOI":"10.4171\/091"},{"key":"57_CR14","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1016\/j.parco.2017.09.001","volume":"74","author":"WH Boukaram","year":"2017","unstructured":"Boukaram, W.H., Turkiyyah, G., Ltaief, H., Keyes, D.E.: Batched QR and SVD algorithms on GPUs with applications in hierarchical matrix compression. Parallel Comput. 74, 19\u201333 (2017)","journal-title":"Parallel Comput."},{"key":"57_CR15","unstructured":"Charara, A., Keyes, D., Ltaief, H.: Batched triangular dense linear algebra kernels for very small matrix sizes on GPUs. ACM Trans. Math. Softw. (2017, submitted). (under review, http:\/\/hdl.handle.net\/10754\/622975)"},{"key":"57_CR16","doi-asserted-by":"publisher","unstructured":"Charara, A., Keyes, D., Ltaief, H.: Software artifact for Euro-Par 2018: Tile Low-Rank GEMM Using Batched Operations on GPUs. figshare. Code. (2018). https:\/\/doi.org\/10.6084\/m9.figshare.6387623","DOI":"10.6084\/m9.figshare.6387623"},{"key":"57_CR17","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1016\/j.parco.2017.12.001","volume":"74","author":"G Ch\u00e1vez","year":"2017","unstructured":"Ch\u00e1vez, G., Turkiyyah, G., Zampini, S., Ltaief, H., Keyes, D.: Accelerated cyclic reduction: a distributed-memory fast solver for structured linear systems. Parallel Comput. 74, 65\u201383 (2017)","journal-title":"Parallel Comput."},{"key":"57_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/42288.42291","volume":"14","author":"J Dongarra","year":"1988","unstructured":"Dongarra, J., Du Croz, J., Hammarling, S., Hanson, R.J.: An extended set of Fortran basic linear algebra subprograms. ACM Trans. Math. Softw. 14, 1\u201317 (1988)","journal-title":"ACM Trans. Math. Softw."},{"key":"57_CR19","unstructured":"Dongarra, J., et al.: A proposed API for batched basic linear algebra subprograms. Mims preprint, University of Manchester (2016). http:\/\/eprints.maths.manchester.ac.uk\/id\/eprint\/2464"},{"issue":"4","key":"57_CR20","doi-asserted-by":"publisher","first-page":"295","DOI":"10.1007\/s00607-003-0019-1","volume":"70","author":"L Grasedyck","year":"2003","unstructured":"Grasedyck, L., Hackbusch, W.: Construction and arithmetics of $$\\mathscr {H}$$-matrices. Computing 70(4), 295\u2013334 (2003). https:\/\/doi.org\/10.1007\/s00607-003-0019-1","journal-title":"Computing"},{"issue":"2","key":"57_CR21","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1007\/s006070050015","volume":"62","author":"W Hackbusch","year":"1999","unstructured":"Hackbusch, W.: A sparse matrix arithmetic based on $$\\mathscr {H}$$-matrices. part i: introduction to $$\\mathscr {H}$$-matrices. Computing 62(2), 89\u2013108 (1999). https:\/\/doi.org\/10.1007\/s006070050015","journal-title":"Computing"},{"key":"57_CR22","series-title":"Springer Series in Computational Mathematics","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-47324-5","volume-title":"Hierarchical Matrices: Algorithms and Analysis","author":"W Hackbusch","year":"2015","unstructured":"Hackbusch, W.: Hierarchical Matrices: Algorithms and Analysis. Springer Series in Computational Mathematics, vol. 49. Springer, Heidelberg (2015). https:\/\/doi.org\/10.1007\/978-3-662-47324-5"},{"issue":"1","key":"57_CR23","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s00607-002-1450-4","volume":"69","author":"W Hackbusch","year":"2002","unstructured":"Hackbusch, W., B\u00f6rm, S.: Data-sparse approximation by adaptive $${\\mathscr {H}}^2$$-matrices. Computing 69(1), 1\u201335 (2002). https:\/\/doi.org\/10.1007\/s00607-002-1450-4","journal-title":"Computing"},{"key":"57_CR24","unstructured":"Hackbusch, W., B\u00f6rm, S., Grasedyck, L.: HLib 1.4. Max-Planck-Institut, Leipzig (2012)"},{"key":"57_CR25","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1007\/978-3-642-59709-1_2","volume-title":"Lectures on Applied Mathematics","author":"W Hackbusch","year":"2000","unstructured":"Hackbusch, W., Khoromskij, B., Sauter, S.: On $$\\mathscr {H}^2$$-matrices. In: Bungartz, H.J., Hoppe, R.H.W., Zenger, C. (eds.) Lectures on Applied Mathematics, pp. 9\u201329. Springer, Heidelberg (2000). https:\/\/doi.org\/10.1007\/978-3-642-59709-1_2"},{"issue":"2","key":"57_CR26","doi-asserted-by":"publisher","first-page":"217","DOI":"10.1137\/090771806","volume":"53","author":"N Halko","year":"2011","unstructured":"Halko, N., Martinsson, P.G., Tropp, J.A.: Finding structure with randomness: probabilistic algorithms for constructing approximate matrix decompositions. SIAM Rev. 53(2), 217\u2013288 (2011). https:\/\/doi.org\/10.1137\/090771806","journal-title":"SIAM Rev."},{"key":"57_CR27","doi-asserted-by":"crossref","unstructured":"Heinecke, A., Henry, G., Hutchinson, M., Pabst, H.: LIBXSMM: accelerating small matrix multiplications by runtime code generation. In: 0001, J.W., Pancake, C.M. (eds.) Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2016, Salt Lake City, UT, USA, 13\u201318 November 2016, p. 84. ACM (2016)","DOI":"10.1109\/SC.2016.83"},{"key":"57_CR28","doi-asserted-by":"publisher","unstructured":"Kim, K., et al.: Designing vector-friendly compact BLAS and LAPACK kernels. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2017, pp. 55:1\u201355:12. ACM, New York (2017). https:\/\/doi.org\/10.1145\/3126908.3126941","DOI":"10.1145\/3126908.3126941"},{"issue":"3","key":"57_CR29","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1007\/s00791-014-0226-7","volume":"16","author":"R Kriemann","year":"2013","unstructured":"Kriemann, R.: LU factorization on many-core systems. Comput. Vis. Sci. 16(3), 105\u2013117 (2013). https:\/\/doi.org\/10.1007\/s00791-014-0226-7","journal-title":"Comput. Vis. Sci."},{"key":"57_CR30","doi-asserted-by":"crossref","unstructured":"Ltaief, H., et al.: Real-time massively distributed multi-object adaptive optics simulations for the european extremely large telescope. In: 2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS), accepted, May 2018","DOI":"10.1109\/IPDPS.2018.00018"},{"key":"57_CR31","doi-asserted-by":"publisher","first-page":"5850","DOI":"10.1175\/2011JCLI4199.1","volume":"24","author":"GR North","year":"2011","unstructured":"North, G.R., Wang, J., Genton, M.G.: Correlation models for temperature fields. J. Clim. 24, 5850\u20135862 (2011)","journal-title":"J. Clim."},{"issue":"4","key":"57_CR32","doi-asserted-by":"publisher","first-page":"27:1","DOI":"10.1145\/2930660","volume":"42","author":"FH Rouet","year":"2016","unstructured":"Rouet, F.H., Li, X.S., Ghysels, P., Napov, A.: A distributed-memory package for dense hierarchically semi-separable matrix computations using randomization. ACM Trans. Math. Softw. 42(4), 27:1\u201327:35 (2016)","journal-title":"ACM Trans. Math. Softw."},{"key":"57_CR33","doi-asserted-by":"crossref","unstructured":"Shi, Y., Niranjan, U.N., Anandkumar, A., Cecka, C.: Tensor contractions with extended BLAS kernels on CPU and GPU. In: HiPC, pp. 193\u2013202. IEEE Computer Society (2016)","DOI":"10.1109\/HiPC.2016.031"},{"issue":"1","key":"57_CR34","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1007\/BF02575706","volume":"33","author":"E Tyrtyshnikov","year":"1996","unstructured":"Tyrtyshnikov, E.: Mosaic-skeleton approximations. Calcolo 33(1), 47\u201357 (1996)","journal-title":"Calcolo"}],"container-title":["Lecture Notes in Computer Science","Euro-Par 2018: Parallel Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-96983-1_57","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,1]],"date-time":"2022-08-01T01:10:39Z","timestamp":1659316239000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-319-96983-1_57"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783319969824","9783319969831"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-96983-1_57","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"1 August 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"Euro-Par","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Parallel Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Turin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 August 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 August 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"europar2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/europar2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}