{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T09:52:37Z","timestamp":1763459557663,"version":"3.45.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2016,2,7]],"date-time":"2016-02-07T00:00:00Z","timestamp":1454803200000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["Grant No. ACI-1339822"],"award-info":[{"award-number":["Grant No. ACI-1339822"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Russian Scientific Fund","award":["N14-11-00190"],"award-info":[{"award-number":["N14-11-00190"]}]},{"DOI":"10.13039\/100000015","name":"U.S. Department of Energy","doi-asserted-by":"publisher","award":["DE-SC0010042"],"award-info":[{"award-number":["DE-SC0010042"]}],"id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2015,2,7]]},"DOI":"10.1145\/2712386.2712387","type":"proceedings-article","created":{"date-parts":[[2015,1,28]],"date-time":"2015-01-28T09:12:26Z","timestamp":1422436346000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Energy efficiency and performance frontiers for sparse computations on GPU supercomputers"],"prefix":"10.1145","author":[{"given":"Hartwig","family":"Anzt","sequence":"first","affiliation":[{"name":"University of Tennessee, Knoxville"}]},{"given":"Stanimire","family":"Tomov","sequence":"additional","affiliation":[{"name":"University of Tennessee, Knoxville"}]},{"given":"Jack","family":"Dongarra","sequence":"additional","affiliation":[{"name":"University of Tennessee, Knoxville"}]}],"member":"320","published-online":{"date-parts":[[2015,2,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342010391989"},{"key":"e_1_3_2_1_2_1","unstructured":"The green 500 list http:\/\/www.green500.org\/."},{"key":"e_1_3_2_1_3_1","unstructured":"The top 500 list http:\/\/www.top.org\/."},{"key":"e_1_3_2_1_4_1","volume-title":"June","author":"Dongarra J.","year":"2013","unstructured":"J. Dongarra and M. A. Heroux, \"Toward a New Metric for Ranking High Performance Computing Systems,\" SANDIA REPORT SAND2013-4744, June 2013."},{"key":"e_1_3_2_1_5_1","volume-title":"Unveiling the performance-energy tradeoff in iterative linear system solvers for multithreaded processors,\" Concurrency and Computation: Practice and Experience","author":"Aliaga J.","year":"2014","unstructured":"J. Aliaga, H. Anzt, M. Castillo, J. Fern\u00e1ndez, G. L\u00e9on, J. P\u00e9rez, and E. Quintana-Ort\u00ed, \"Unveiling the performance-energy tradeoff in iterative linear system solvers for multithreaded processors,\" Concurrency and Computation: Practice and Experience, 2014."},{"key":"e_1_3_2_1_6_1","volume-title":"Sparse BLAS and Sparse Solver Performance Charts: DCSRGEMV and DCSRMM","author":"Kernel Library Intel\u00ae Math","year":"2014","unstructured":"\"Intel\u00ae Math Kernel Library. Sparse BLAS and Sparse Solver Performance Charts: DCSRGEMV and DCSRMM,\" October 2014. {Online}. Available: https:\/\/software.intel.com\/en-us\/intel-mkl"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/1970638"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2014.48"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1137\/S1064827500366124"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1504\/IJCSE.2006.012774"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2008.01.018"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2005.1"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/1188455.1188504"},{"key":"e_1_3_2_1_14_1","unstructured":"A. Knyazev. https:\/\/code.google.com\/p\/blopex\/."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1137\/060661624"},{"key":"e_1_3_2_1_16_1","unstructured":"I. C. Lab \"Software distribution of MAGMA version 1.5 \" http:\/\/icl.cs.utk.edu\/magma\/ 2014."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2009.353"},{"key":"e_1_3_2_1_18_1","first-page":"276","volume-title":"2010 18th IEEE\/IFIP","author":"Jim\u00e9nez V.","year":"2010","unstructured":"V. Jim\u00e9nez, R. Gioiosa, E. Kursun, F. Cazorla, C.-Y. Cher, A. Buyuktosunoglu, P. Bose, and M. Valero, \"Trends and techniques for energy efficient architectures,\" in VLSI System on Chip Conference (VLSI-SoC), 2010 18th IEEE\/IFIP, Sept 2010, pp. 276--279."},{"key":"e_1_3_2_1_19_1","first-page":"56","volume-title":"2013 IEEE International Symposium on","author":"Kestor G.","year":"2013","unstructured":"G. Kestor, R. Gioiosa, D. Kerbyson, and A. Hoisie, \"Quantifying the energy cost of data movement in scientific applications,\" in Workload Characterization (IISWC), 2013 IEEE International Symposium on, Sept 2013, pp. 56--65."},{"key":"e_1_3_2_1_20_1","first-page":"1","article-title":"Evaluating the performance and energy efficiency of the cosmo-art model system","author":"Charles J.","year":"2014","unstructured":"J. Charles, W. Sawyer, M. F. Dolz, and S. Catal\u00e1n, \"Evaluating the performance and energy efficiency of the cosmo-art model system,\" Computer Science - Research and Development, pp. 1--10, 2014.","journal-title":"Computer Science - Research and Development"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.5194\/gmd-4-1077-2011"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10586-012-0219-6"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063482"},{"key":"e_1_3_2_1_24_1","first-page":"7664","article-title":"An analysis of energy-optimized lattice-boltzmann cfd simulations from the chip to the highly parallel level","volume":"1304","author":"Wittmann M.","year":"2013","unstructured":"M. Wittmann, G. Hager, T. Zeiser, and G. Wellein, \"An analysis of energy-optimized lattice-boltzmann cfd simulations from the chip to the highly parallel level,\" CoRR, vol. abs\/1304.7664, 2013.","journal-title":"CoRR"},{"key":"e_1_3_2_1_25_1","unstructured":"NV CUSPARSE LIBRARY July 2013."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1002\/pamm.201210004"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.5555\/248979"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.5555\/323215"},{"key":"e_1_3_2_1_29_1","first-page":"2159","article-title":"Communication-avoiding parallel and sequential QR factorizations","volume":"0806","author":"Demmel J.","year":"2008","unstructured":"J. Demmel, L. Grigori, M. Hoemmen, and J. Langou, \"Communication-avoiding parallel and sequential QR factorizations,\" CoRR, vol. abs\/0806.2159, 2008.","journal-title":"CoRR"},{"key":"e_1_3_2_1_30_1","volume-title":"Communication-avoiding QR decomposition for GPUs,\" EECS Department","author":"Anderson M.","year":"2010","unstructured":"M. Anderson, G. Ballard, J. Demmel, and K. Keutzer, \"Communication-avoiding QR decomposition for GPUs,\" EECS Department, UC, Berkeley, Tech. Rep. UCB\/EECS-2010-131, Oct 2010."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2011.90"},{"key":"e_1_3_2_1_32_1","unstructured":"E. Jones T. Oliphant P. Peterson et al. \"SciPy: Open source scientific tools for Python \" 2001--. {Online: http:\/\/www.scipy.org\/}."},{"volume-title":"Octopus: a tool for the application of time-dependent density functional theory,\" phys. stat. sol. (b)","author":"A. Castro","key":"e_1_3_2_1_33_1","unstructured":"A. Castro et al, \"Octopus: a tool for the application of time-dependent density functional theory,\" phys. stat. sol. (b), vol. 243, no. 11, pp. 2465--2488, 2006."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/1527286.1527287"},{"key":"e_1_3_2_1_35_1","unstructured":"M. Heroux et al \"An Overview of Trilinos \" Sandia National Laboratories Tech. Rep. SAND2003--2927 2003."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0927-0256(02)00325-7"},{"key":"e_1_3_2_1_37_1","volume-title":"NVIDIA CUDA Compute Unified Device Architecture Programming Guide","year":"2014","unstructured":"NVIDIA CUDA Compute Unified Device Architecture Programming Guide, 6th ed., NVIDIA Corporation, April 2014.","edition":"6"},{"key":"e_1_3_2_1_38_1","first-page":"104","article-title":"Preconditioned eigensolvers - an oxymoron?","volume":"7","author":"Knyazev A. V.","year":"1998","unstructured":"A. V. Knyazev, \"Preconditioned eigensolvers - an oxymoron?\" ETNA, vol. 7, pp. 104--123, 1998.","journal-title":"ETNA"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2006.02.007"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.apnum.2004.09.026"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1002\/pamm.201110360"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"T. V. Kolev and P. S. Vassilevski \"Parallel eigensolver for H(curl) problems using H1-auxiliary space AMG preconditioning \" LLNL Livermore CA Tech. Rep. UCRL-TR-226197 2006.","DOI":"10.2172\/900179"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.5555\/865735"},{"key":"e_1_3_2_1_44_1","volume-title":"Implementing a Sparse Matrix Vector Product for the SELL-C\/SELL-C-&sigma","author":"Anzt H.","year":"2014","unstructured":"H. Anzt, S. Tomov, and J. Dongarra, \"Implementing a Sparse Matrix Vector Product for the SELL-C\/SELL-C-&sigma; formats on NVIDIA GPUs,\" University of Tennessee, Tech. Rep. ut-eecs-14-727, March 2014."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611971538"},{"key":"e_1_3_2_1_46_1","volume-title":"Efficient sparse matrix-vector multiplication on CUDA","author":"Bell N.","year":"2008","unstructured":"N. Bell and M. Garland, \"Efficient sparse matrix-vector multiplication on CUDA,\" Dec. 2008."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-11515-8_10"},{"key":"e_1_3_2_1_48_1","first-page":"6209","article-title":"A unified sparse matrix data format for modern processors with wide simd units","volume":"1307","author":"Kreutzer M.","year":"2013","unstructured":"M. Kreutzer, G. Hager, G. Wellein, H. Fehske, and A. R. Bishop, \"A unified sparse matrix data format for modern processors with wide simd units,\" CoRR, vol. abs\/1307.6209, 2013.","journal-title":"CoRR"},{"key":"e_1_3_2_1_49_1","volume-title":"July","author":"N. Corp.","year":"2013","unstructured":"N. Corp., NVIDIA CUDA TOOLKIT V6.0, July 2013."},{"key":"e_1_3_2_1_50_1","volume-title":"October","author":"Kernel Intel\u00ae Math","year":"2007","unstructured":"\"Intel\u00ae Math Kernel Library for Linux* OS,\" Document Number: 314774--005US, October 2007, Intel Corporation."},{"key":"e_1_3_2_1_51_1","unstructured":"(2014) Piz Daint Computing Resources. Swiss National Computing Centre."},{"key":"e_1_3_2_1_52_1","unstructured":"G. Fourestey B. Cumming L. Gilly and T. C. Schulthess. (2014 August) First Experiences With Validating and Using the Cray Power Management Database Tool."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342010385729"},{"key":"e_1_3_2_1_54_1","volume-title":"Mixed-precision orthogonalization scheme and adaptive step size for ca-gmres on gpus,\" VECPAR 2014 (Accepted), jan","author":"Yamazaki I.","year":"2014","unstructured":"I. Yamazaki, S. Tomov, T. Dong, and J. Dongarra, \"Mixed-precision orthogonalization scheme and adaptive step size for ca-gmres on gpus,\" VECPAR 2014 (Accepted), jan 2014."}],"event":{"name":"PPoPP '15: 20th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages"],"location":"San Francisco California","acronym":"PPoPP '15"},"container-title":["Proceedings of the Sixth International Workshop on Programming Models and Applications for Multicores and Manycores"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2712386.2712387","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2712386.2712387","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2712386.2712387","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T09:47:18Z","timestamp":1763459238000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2712386.2712387"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,2,7]]},"references-count":54,"alternative-id":["10.1145\/2712386.2712387","10.1145\/2712386"],"URL":"https:\/\/doi.org\/10.1145\/2712386.2712387","relation":{},"subject":[],"published":{"date-parts":[[2015,2,7]]},"assertion":[{"value":"2015-02-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}