{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,2]],"date-time":"2026-07-02T23:48:37Z","timestamp":1783036117464,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":25,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,2,4]],"date-time":"2018-02-04T00:00:00Z","timestamp":1517702400000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100007065","name":"Nvidia","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100007065","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["ACI-1339822"],"award-info":[{"award-number":["ACI-1339822"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000015","name":"U.S. Department of Energy","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2017,2,4]]},"DOI":"10.1145\/3038228.3038237","type":"proceedings-article","created":{"date-parts":[[2017,2,16]],"date-time":"2017-02-16T10:18:54Z","timestamp":1487240334000},"page":"42-52","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["High-performance Cholesky factorization for GPU-only execution"],"prefix":"10.1145","author":[{"given":"Azzam","family":"Haidar","sequence":"first","affiliation":[{"name":"University of Tennessee, U.S.A."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ahmad","family":"Abdelfatah","sequence":"additional","affiliation":[{"name":"University of Tennessee, U.S.A."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Stanimire","family":"Tomov","sequence":"additional","affiliation":[{"name":"University of Tennessee, U.S.A."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jack","family":"Dongarra","sequence":"additional","affiliation":[{"name":"University of Tennessee, U.S.A., Oak Ridge National Laboratory, U.S.A., University of Manchester, UK"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2017,2,4]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2016.190"},{"key":"e_1_3_2_1_2_1","volume-title":"Performance Tuning and Optimization Techniques of Fixed and Variable Size Batched Cholesky Factorization on GPUs. In International Conference on Computational Science (ICCS'16)","author":"Abdelfattah Ahmad","year":"2016","unstructured":"Ahmad Abdelfattah, Azzam Haidar, Stanimire Tomov, and Jack Dongarra. 2016. Performance Tuning and Optimization Techniques of Fixed and Variable Size Batched Cholesky Factorization on GPUs. In International Conference on Computational Science (ICCS'16). San Diego, CA."},{"key":"e_1_3_2_1_3_1","volume-title":"GPU Computing Gems, Wen mei W","author":"Agullo Emmanuel","unstructured":"Emmanuel Agullo, C\u00e9dric Augonnet, Jack Dongarra, Hatem Ltaief, Raymond Namyst, Samuel Thibault, and Stanimire Tomov. 2010. Faster, Cheaper, Better -- a Hybridization Methodology to Develop Linear Algebra Software for GPUs. In GPU Computing Gems, Wen mei W. Hwu (Ed.). Vol. 2. Morgan Kaufmann."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/180\/1\/012037"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","unstructured":"E. Anderson Z. Bai C. Bischof L. S. Blackford J. W. Demmel J. J. Dongarra J. Du Croz A. Greenbaum S. Hammarling A. McKenney and D. Sorensen. 1992. LAPACK Users' Guide. SIAM Philadelphia PA. http:\/\/www.netlib.org\/lapack\/lug\/.","DOI":"10.5555\/130639"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2664666.2664667"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC.2014.30"},{"key":"e_1_3_2_1_8_1","volume-title":"Proc. of 2014 International Conference on Parallel Processing (ICPP-2014)","author":"Dong T.","unstructured":"T. Dong, A. Haidar, S. Tomov, and J. Dongarra. 2014. A Fast Batched Cholesky Factorization on a GPU. In Proc. of 2014 International Conference on Parallel Processing (ICPP-2014)."},{"key":"e_1_3_2_1_9_1","volume-title":"A Proposed API for Batched Basic Linear Algebra Subprograms. MIMS EPrint","author":"Dongarra Jack","year":"2016","unstructured":"Jack Dongarra, Iain Duff, Mark Gates, Azzam Haidar, Sven Hammarling, Nicholas J. Higham, Jonathon Hogg, Pedro Valero-Lara, Samuel D. Relton, Stanimire Tomov, and Mawussi Zounon. 2016. A Proposed API for Batched Basic Linear Algebra Subprograms. MIMS EPrint 2016.25. Manchester Institute for Mathematical Sciences, The University of Manchester, UK. 20 pages. http:\/\/eprints.ma.man.ac.uk\/2464\/"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.14529\/jsfi140105"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/77626.79170"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/1513895.1513901"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2014.58"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","unstructured":"Azzam Haidar Tingxing Dong Piotr Luszczek Stanimire Tomov and Jack Dongarra. Batched matrix computations on hardware accelerators based on GPUs. International Journal of High Performance Computing Applications (02\/2015 ????) 10.1177\/1094342014567546","DOI":"10.1177\/1094342014567546"},{"key":"e_1_3_2_1_15_1","volume-title":"ISC High Performance","author":"Haidar Azzam","unstructured":"Azzam Haidar, Tingxing Dong, Stanimire Tomov, Piotr Luszczek, and Jack Dongarra. 2015. Framework for Batched and GPU-resident Factorization Algorithms to Block Householder Transformations. In ISC High Performance. Springer, Springer, Frankfurt, Germany."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1155\/2015\/502593"},{"key":"e_1_3_2_1_17_1","volume-title":"Performance Analysis and Optimization Techniques for the Intel Knights Landing Xeon Phi. In IEEE High Performance Extreme Computing Conference (HPEC'16)","author":"Haidar Azzam","year":"2016","unstructured":"Azzam Haidar, Stanimire Tomov, Konstantin Arturov, Murat Guney, Shane Story, and Jack Dongarra. 2016. LU, QR, and Cholesky Factorizations: Programming Model, Performance Analysis and Optimization Techniques for the Intel Knights Landing Xeon Phi. In IEEE High Performance Extreme Computing Conference (HPEC'16). IEEE, IEEE, Waltham, MA."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2015.7322444"},{"key":"e_1_3_2_1_19_1","volume-title":"University of Tennessee","author":"Laboratory Innovative Computing","year":"2010","unstructured":"Innovative Computing Laboratory, University of Tennessee 2010. PLASMA Users' Guide, Parallel Linear Algebra Software for Multicore Architectures, Version 2.0. Innovative Computing Laboratory, University of Tennessee. http:\/\/icl.cs.utk.edu\/projectsfiles\/plasma\/pdf\/users_guide.pdf."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2011.71"},{"key":"e_1_3_2_1_21_1","unstructured":"NVIDIA Corporation 2016. cuSOLVER 8.0. (2016). Available at http:\/\/docs.nvidia.com\/cuda\/cusolver\/."},{"key":"e_1_3_2_1_22_1","volume-title":"Lookahead and Algorithmic Blocking Techniques Compared for Parallel Matrix Factorization. In 10th International Conference on Parallel and Distributed Computing and Systems, IASTED","author":"Strazdins Peter E.","year":"1998","unstructured":"Peter E. Strazdins. 1998. Lookahead and Algorithmic Blocking Techniques Compared for Parallel Matrix Factorization. In 10th International Conference on Parallel and Distributed Computing and Systems, IASTED. Las Vegas, USA."},{"key":"e_1_3_2_1_23_1","unstructured":"Erich Strohmaier Jack Dongarra Horst Simon and Martin Meuer. 1993-2016. TOP500 Supercomputer Sites. (1993-2016). Available from: http:\/\/www.top500.org\/."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2009.12.005"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2012.04.005"}],"event":{"name":"PPoPP '17: 22nd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","location":"Austin TX USA","acronym":"PPoPP '17","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages"]},"container-title":["Proceedings of the General Purpose GPUs"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3038228.3038237","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3038228.3038237","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3038228.3038237","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T09:45:45Z","timestamp":1763459145000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3038228.3038237"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,2,4]]},"references-count":25,"alternative-id":["10.1145\/3038228.3038237","10.1145\/3038228"],"URL":"https:\/\/doi.org\/10.1145\/3038228.3038237","relation":{},"subject":[],"published":{"date-parts":[[2017,2,4]]},"assertion":[{"value":"2017-02-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}