{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T06:09:46Z","timestamp":1763705386011,"version":"3.37.3"},"reference-count":42,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"5","license":[{"start":{"date-parts":[[2018,5,1]],"date-time":"2018-05-01T00:00:00Z","timestamp":1525132800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2018,5,1]],"date-time":"2018-05-01T00:00:00Z","timestamp":1525132800000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2018,5,1]],"date-time":"2018-05-01T00:00:00Z","timestamp":1525132800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2018,5,1]],"date-time":"2018-05-01T00:00:00Z","timestamp":1525132800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Science Foundation","award":["CSR 1514286","OAC 1740250"],"award-info":[{"award-number":["CSR 1514286","OAC 1740250"]}]},{"DOI":"10.13039\/100007065","name":"NVIDIA","doi-asserted-by":"crossref","id":[{"id":"10.13039\/100007065","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/100000015","name":"Department of Energy","doi-asserted-by":"crossref","id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2018,5,1]]},"DOI":"10.1109\/tpds.2017.2783929","type":"journal-article","created":{"date-parts":[[2017,12,15]],"date-time":"2017-12-15T22:17:33Z","timestamp":1513376253000},"page":"973-984","source":"Crossref","is-referenced-by-count":21,"title":["A Guide for Achieving High Performance with Very Small Matrices on GPU: A Case Study of Batched LU and Cholesky Factorizations"],"prefix":"10.1109","volume":"29","author":[{"given":"Azzam","family":"Haidar","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5054-4784","authenticated-orcid":false,"given":"Ahmad","family":"Abdelfattah","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6955-1500","authenticated-orcid":false,"given":"Mawussi","family":"Zounon","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Stanimire","family":"Tomov","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jack","family":"Dongarra","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"article-title":"Hybrid multicore cholesky factorization\n with multiple GPU accelerators","year":"0","author":"ltaief","key":"ref39"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"ref33","article-title":"Faster, cheaper, better &#x2013; a\n hybridization methodology to develop linear algebra software for GPUs","volume":"2","author":"agullo","year":"2010","journal-title":"GPU Computing Gems"},{"article-title":"LU, QR and Cholesky factorizations using vector capabilities of GPUs","year":"2008","author":"volkov","key":"ref32"},{"key":"ref31","article-title":"Dense\n linear algebra solvers for multicore with GPU accelerators","author":"tomov","year":"2014","journal-title":"Proc IEEE Int Symp Parallel Distrib Process Workshops PhD Forum"},{"year":"1999","key":"ref30"},{"article-title":"Optimized LU-decomposition with full pivot for small batched\n matrices","year":"2013","author":"wainwright","key":"ref37"},{"key":"ref36","first-page":"1","article-title":"Accelerating subsurface transport simulation on heterogeneous clusters","author":"oreste","year":"2013","journal-title":"Proc IEEE Int Conf Cluster Comput"},{"key":"ref35","first-page":"813","article-title":"Power\/performance\n trade-offs of small batched LU based solvers on GPUs","volume":"8097","author":"oreste","year":"2013","journal-title":"Proc 19th Int Conf Parallel Process"},{"key":"ref34","first-page":"85","article-title":"Model-driven one-sided factorizations on multicore accelerated systems","volume":"1","author":"dongarra","year":"2014","journal-title":"International Journal on Supercomputing Frontiers and Innovations"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.1881"},{"key":"ref40","doi-asserted-by":"crossref","first-page":"606","DOI":"10.1016\/j.procs.2017.05.250","article-title":"Factorization and inversion of a\n million matrices using GPUs: Challenges and countermeasures","volume":"108","author":"abdelfattah","year":"2017","journal-title":"Procedia Comput Sci"},{"article-title":"Distributed graph-based density. matrix calculation for quantum. molecular dynamics using GPUS","year":"2016","author":"mniszewski","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1063\/1.4952650"},{"key":"ref13","article-title":"cuDNN: Efficient primitives for deep learning","author":"chetlur","year":"2014","journal-title":"CoRR"},{"year":"2016","key":"ref14"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1021\/ct400331r"},{"article-title":"Efficient\n multi-level heterogeneous parallelization of molecular dynamics in gromacs.","year":"0","author":"p\u00e1ll","key":"ref16"},{"key":"ref17","article-title":"Tackling exascale software challenges in molecular dynamics\n simulations with GROMACS","author":"p\u00e1ll","year":"2015","journal-title":"CoRR"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1088\/1749-4699\/2\/1\/015001"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1088\/1749-4699\/6\/1\/015003"},{"year":"2014","key":"ref28"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2014.2328614"},{"article-title":"A proposed API for batched basic linear\n algebra subprograms","year":"2016","author":"dongarra","key":"ref27"},{"key":"ref3","first-page":"92","article-title":"Multicore and\n accelerator development for a leadership-class stellar astrophysics code","author":"messer","year":"2012","journal-title":"Proc 11th Int Conf Appl Parallel Sci Comput State-of-the-Art Scientific Parallel Comput"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2014.103"},{"year":"2014","key":"ref29"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2012.11"},{"article-title":"Sparse\n mulitfrontal QR on the GPU","year":"2013","author":"yeralan","key":"ref8"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1080\/00268970500275780"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2915921"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/2712386.2712391"},{"key":"ref1","article-title":"cuDNN: Efficient primitives for deep learning","author":"chetlur","year":"2014","journal-title":"CoRR"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2015.09.013"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2009.12.005"},{"key":"ref21","doi-asserted-by":"crossref","first-page":"862","DOI":"10.1086\/306692","article-title":"Silicon burning II: Quasi-equilibrium and\n explosive burning","volume":"511","author":"raphael","year":"1999","journal-title":"ApJ"},{"key":"ref42","first-page":"511","article-title":"Optimized batched linear algebra for modern architectures","author":"dongarra","year":"2017","journal-title":"Proc 23rd Int Conf Parallel Distrib Comput Santiago de Compostela"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC.2014.30"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2017.05.138"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2014.52"},{"year":"2015","key":"ref26"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/2688500.2688534"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielaam\/71\/8334163\/8214236-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/8334163\/08214236.pdf?arnumber=8214236","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,8]],"date-time":"2022-04-08T18:48:44Z","timestamp":1649443724000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8214236\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,5,1]]},"references-count":42,"journal-issue":{"issue":"5"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2017.2783929","relation":{},"ISSN":["1045-9219"],"issn-type":[{"type":"print","value":"1045-9219"}],"subject":[],"published":{"date-parts":[[2018,5,1]]}}}