{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T09:41:26Z","timestamp":1763458886571,"version":"3.45.0"},"publisher-location":"New York, NY, USA","reference-count":30,"publisher":"ACM","license":[{"start":{"date-parts":[[2016,2,7]],"date-time":"2016-02-07T00:00:00Z","timestamp":1454803200000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["ACI-1339822"],"award-info":[{"award-number":["ACI-1339822"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000015","name":"U.S. Department of Energy","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100007065","name":"Nvidia","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100007065","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2015,2,7]]},"DOI":"10.1145\/2716282.2716288","type":"proceedings-article","created":{"date-parts":[[2015,2,3]],"date-time":"2015-02-03T08:43:17Z","timestamp":1422952997000},"page":"59-69","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Optimization for performance and energy for batched matrix computations on GPUs"],"prefix":"10.1145","author":[{"given":"Azzam","family":"Haidar","sequence":"first","affiliation":[{"name":"University of Tennessee, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tingxing","family":"Dong","sequence":"additional","affiliation":[{"name":"University of Tennessee, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Piotr","family":"Luszczek","sequence":"additional","affiliation":[{"name":"University of Tennessee, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Stanimire","family":"Tomov","sequence":"additional","affiliation":[{"name":"University of Tennessee, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jack","family":"Dongarra","sequence":"additional","affiliation":[{"name":"University of Tennessee, USA \/ Oak Ridge National Laboratory, USA \/ University of Manchester, UK"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2015,2,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Cheaper, Better \u2013 a Hybridization Methodology to Develop Linear Algebra Software for GPUs. In W. mei W","author":"Agullo E.","year":"2010","unstructured":"E. Agullo, C. Augonnet, J. Dongarra, H. Ltaief, R. Namyst, S. Thibault, and S. Tomov. Faster, Cheaper, Better \u2013 a Hybridization Methodology to Develop Linear Algebra Software for GPUs. In W. mei W. Hwu, editor, GPU Computing Gems, volume 2. Morgan Kaufmann, Sept. 2010."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/180\/1\/012037"},{"key":"e_1_3_2_1_3_1","unstructured":"ACML - AMD Core Math Library 2014. Available at http:\/\/developer.amd.com\/tools-and-sdks\/cpu-development\/ amd-core-math-library-acml."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.5555\/323215"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2012.11"},{"issue":"119","key":"e_1_3_2_1_6_1","article-title":"Stability of methods for matrix inversion","volume":"12","author":"Croz D.","year":"1992","unstructured":"D. Croz, J. J. Dongarra, and N. J. Higham. Stability of methods for matrix inversion. IMA J. Numer. Anal., 12(119), 1992.","journal-title":"IMA J. Numer. Anal."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2014.103"},{"key":"e_1_3_2_1_8_1","volume-title":"Exploiting fine-grain parallelism in recursive LU factorization","author":"Dongarra J.","year":"2011","unstructured":"J. Dongarra, M. Faverge, H. Ltaief, and P. Luszczek. Exploiting fine-grain parallelism in recursive LU factorization. In ParCo 2011 \u2013 International Conference on Parallel Computing, Ghent, Belgium, August 30-September 2 2011."},{"key":"e_1_3_2_1_9_1","first-page":"429","article-title":"Exploiting fine-grain parallelism in recursive LU factorization. Advances in Parallel Computing","volume":"22","author":"Dongarra J.","year":"2012","unstructured":"J. Dongarra, M. Faverge, H. Ltaief, and P. Luszczek. Exploiting fine-grain parallelism in recursive LU factorization. Advances in Parallel Computing, Special Issue, 22:429\u2013436, 2012. ISBN 978-1-61499-040-6 (print); ISBN 978-1-61499-041-3 (online).","journal-title":"Special Issue"},{"issue":"1","key":"e_1_3_2_1_10_1","article-title":"Model-driven one-sided factorizations on multicore accelerated systems","volume":"1","author":"Dongarra J.","year":"2014","unstructured":"J. Dongarra, A. Haidar, J. Kurzak, P. Luszczek, S. Tomov, and A. YarKhan. Model-driven one-sided factorizations on multicore accelerated systems. International Journal on Supercomputing Frontiers and Innovations, 1(1), June 2014.","journal-title":"International Journal on Supercomputing Frontiers and Innovations"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342004041296"},{"key":"e_1_3_2_1_12_1","unstructured":"Matrix algebra on GPU and multicore architectures (MAGMA) 2014. Available at http:\/\/icl.cs.utk.edu\/magma\/."},{"key":"e_1_3_2_1_13_1","unstructured":"Intel Pentium III Processor - Small Matrix Library 1999. Available at http:\/\/www.intel.com\/design\/pentiumiii\/sml\/."},{"key":"e_1_3_2_1_14_1","unstructured":"Intel Math Kernel Library 2014. Available at http:\/\/software.intel.com\/intel-mkl\/."},{"key":"e_1_3_2_1_15_1","unstructured":"Intel R 64 and IA-32 architectures software developer\u2019s manual July 20 2014. Available at http:\/\/download.intel.com\/products\/processor\/manual\/."},{"key":"e_1_3_2_1_16_1","first-page":"1574","volume-title":"Proceedings of 2012 IEEE High Performance Extreme Computing Conference (HPEC 2012","author":"Luszczek P.","year":"2012","unstructured":"P. Luszczek and J. Dongarra. Anatomy of a globally recursive embedded LINPACK benchmark. In Proceedings of 2012 IEEE High Performance Extreme Computing Conference (HPEC 2012), Westin Hotel, Waltham, Massachusetts, September 10-12 2012. IEEE Catalog Number: CFP12HPE-CDR, ISBN: 978-1-4673-1574-6."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-36803-5_6"},{"key":"e_1_3_2_1_18_1","volume-title":"Poster: A batched Cholesky solver for local RX anomaly detection on GPUs","author":"Molero J.","year":"2013","unstructured":"J. Molero, E. Garz\u00f3n, I. Garc\u00b4\u0131a, E. Quintana-Ort\u00b4\u0131, and A. Plaza. Poster: A batched Cholesky solver for local RX anomaly detection on GPUs, 2013. PUMPS."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342010385729"},{"volume-title":"\/\/developer.nvidia.com\/nvidia-management-library-nvml","year":"2014","key":"e_1_3_2_1_20_1","unstructured":"Available at https: \/\/developer.nvidia.com\/nvidia-management-library-nvml, 2014."},{"key":"e_1_3_2_1_21_1","unstructured":"CUBLAS 2014. Available at http:\/\/docs.nvidia.com\/cuda\/cublas\/."},{"key":"e_1_3_2_1_22_1","unstructured":"The OpenACC TM application programming interface version 1.0 November 2011."},{"key":"e_1_3_2_1_23_1","unstructured":"OpenMP application program interface July 2013. Version 4.0."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-40047-6_81"},{"key":"e_1_3_2_1_25_1","volume-title":"IEEE International Conference on Cluster Computing (CLUSTER 2013","author":"Oreste V.","year":"2013","unstructured":"V. Oreste, N. A. Gawande, and A. Tumeo. Accelerating subsurface transport simulation on heterogeneous clusters. In IEEE International Conference on Cluster Computing (CLUSTER 2013), Indianapolis, Indiana, September, 23-27 2013."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2012.12"},{"key":"e_1_3_2_1_27_1","volume-title":"Proc. of the IEEE IPDPS\u201910","author":"Tomov S.","year":"2014","unstructured":"S. Tomov, R. Nath, and J. Dongarra. Dense linear algebra solvers for multicore with GPU accelerators. In Proc. of the IEEE IPDPS\u201910, Atlanta, GA, April 19-23 2014."},{"key":"e_1_3_2_1_28_1","unstructured":"V. Volkov and J. W. Demmel. LU QR and Cholesky factorizations using vector capabilities of GPUs. Technical Report UCB\/EECS-2008-49 University of California Berkeley May 13 2008. Also available as LAPACK Working Note 202."},{"volume-title":"April, 2013. GTC\u201913 \u2013 ID S3069","author":"Wainwright I.","key":"e_1_3_2_1_29_1","unstructured":"I. Wainwright. Optimized LU-decomposition with full pivot for small batched matrices, April, 2013. GTC\u201913 \u2013 ID S3069."},{"key":"e_1_3_2_1_30_1","unstructured":"S. N. Yeralan T. A. Davis and S. Ranka. Sparse mulitfrontal QR on the GPU. Technical report University of Florida Technical Report 2013."}],"event":{"name":"GPGPU-8: General-purpose Processing with Graphics Processing Units 8","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages"],"location":"San Francisco CA USA","acronym":"GPGPU-8"},"container-title":["Proceedings of the 8th Workshop on General Purpose Processing using GPUs"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2716282.2716288","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2716282.2716288","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2716282.2716288","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T09:37:35Z","timestamp":1763458655000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2716282.2716288"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,2,7]]},"references-count":30,"alternative-id":["10.1145\/2716282.2716288","10.1145\/2716282"],"URL":"https:\/\/doi.org\/10.1145\/2716282.2716288","relation":{},"subject":[],"published":{"date-parts":[[2015,2,7]]},"assertion":[{"value":"2015-02-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}