{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,2,13]],"date-time":"2024-02-13T08:47:18Z","timestamp":1707814038427},"reference-count":37,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2021,6,1]],"date-time":"2021-06-01T00:00:00Z","timestamp":1622505600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2021,11,25]],"date-time":"2021-11-25T00:00:00Z","timestamp":1637798400000},"content-version":"vor","delay-in-days":177,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"DOI":"10.13039\/100010688","name":"Horizon 2020 European Innovation Council Fast Track to Innovation","doi-asserted-by":"publisher"},{"DOI":"10.13039\/501100010198","name":"Gobierno de Espa\u00f1a Ministerio de Ciencia e Innovaci\u00f3n","doi-asserted-by":"publisher"},{"DOI":"10.13039\/501100010801","name":"Xunta de Galicia","doi-asserted-by":"publisher"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Advances in Engineering Software"],"published-print":{"date-parts":[[2021,6]]},"DOI":"10.1016\/j.advengsoft.2021.102997","type":"journal-article","created":{"date-parts":[[2021,4,19]],"date-time":"2021-04-19T03:59:34Z","timestamp":1618804774000},"page":"102997","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":3,"title":["A new AXT format for an efficient SpMV product using AVX-512 instructions and CUDA"],"prefix":"10.1016","volume":"156","author":[{"given":"E.","family":"Coronado-Barrientos","sequence":"first","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0002-2486-7990","authenticated-orcid":false,"given":"M.","family":"Antonioletti","sequence":"additional","affiliation":[]},{"given":"A.","family":"Garcia-Loureiro","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"12","key":"10.1016\/j.advengsoft.2021.102997_bib0001","doi-asserted-by":"crossref","first-page":"1784","DOI":"10.1002\/nme.4865","article-title":"A new sparse matrix vector multiplication graphics processing unit algorithm designed for finite element problems","volume":"102","author":"Wong","year":"2015","journal-title":"International Journal for Numerical Methods in Engineering"},{"key":"10.1016\/j.advengsoft.2021.102997_sbref0002","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1016\/j.advengsoft.2016.10.002","article-title":"Conjugate gradient method with graphics processing unit acceleration: Cuda vs opencl","volume":"111","author":"Ahamed","year":"2017","journal-title":"Advances in Engineering Software"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0003","doi-asserted-by":"crossref","first-page":"456","DOI":"10.1007\/978-3-319-95168-3_31","article-title":"Practical Implementation of Lattice QCD Simulation on SIMD Machines with Intel AVX-512","author":"Kanamori","year":"2018","journal-title":"Lecture Notes in Computer Science"},{"key":"10.1016\/j.advengsoft.2021.102997_sbref0004","series-title":"Proceedings of the 2015 Spanish Conference on Electron Devices","first-page":"1","article-title":"Implementation of numerical methods for nanoscaled semiconductor device simulation using OpenCL","author":"Coronado-Barrientos","year":"2015"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0005","series-title":"Tech. Rep.","article-title":"Efficient Sparse Matrix-Vector Multiplication on CUDA","author":"Bell","year":"2008"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0006","series-title":"Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis","isbn-type":"print","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1654059.1654078","article-title":"Implementing Sparse Matrix-vector Multiplication on Throughput-oriented Processors","author":"Bell","year":"2009","ISBN":"http:\/\/id.crossref.org\/isbn\/9781605587448"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0007","series-title":"Proceedings of the 2012 IEEE 26th International Parallel and Distributed Processing Symposium Workshops & PhD Forum","isbn-type":"print","first-page":"1696","article-title":"Sparse Matrix-vector Multiplication on GPGPU Clusters: A New Storage Format and a Scalable Implementation","author":"Kreutzer","year":"2012","ISBN":"http:\/\/id.crossref.org\/isbn\/9780769546766"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0008","series-title":"Templates for the Solution of Linear Systems: Building Blocks for Iterative Methods","author":"Barrett","year":"1994"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0009","series-title":"2019 20th International Conference on Parallel and Distributed Computing, Applications and Technologies (PDCAT)","first-page":"121","article-title":"Accelerating conjugate gradient using ompss","author":"Catal\u00e1n","year":"2019"},{"key":"10.1016\/j.advengsoft.2021.102997_sbref0010","doi-asserted-by":"crossref","first-page":"49","DOI":"10.1007\/s11227-012-0761-2","article-title":"The BiConjugate gradient on GPUs","volume":"64","author":"Ortega","year":"2012","journal-title":"The Journal of Supercomputing"},{"issue":"6","key":"10.1016\/j.advengsoft.2021.102997_bib0011","doi-asserted-by":"crossref","first-page":"2823","DOI":"10.1007\/s11227-018-2325-6","article-title":"Improving Performance of Iterative Solvers with the AXC Format Using the Intel Xeon Phi","volume":"74","author":"Coronado-Barrientos","year":"2018","journal-title":"J Supercomput"},{"key":"10.1016\/j.advengsoft.2021.102997_sbref0012","series-title":"Itpack 2.0 User\u2019s Guide","author":"Grimes","year":"1979"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0013","unstructured":"NVIDIA Developer Zone. CUDA Toolkit documentation. https:\/\/docs.nvidia.com\/cuda\/index.html; Accessed: 2020-02-06."},{"key":"10.1016\/j.advengsoft.2021.102997_bib0014","unstructured":"Intel. Intel Intrinsics Guide. https:\/\/software.intel.com\/sites\/landingpage\/IntrinsicsGuide\/#; Accessed: 2020-02-06."},{"key":"10.1016\/j.advengsoft.2021.102997_bib0015","series-title":"Intel Xeon Phi Processor High Performance Programming. Knights Landing Edition","isbn-type":"print","author":"Jeffers","year":"2016","ISBN":"http:\/\/id.crossref.org\/isbn\/9780128091944"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0016","series-title":"CUDA by Example: An Introduction to General-Purpose GPU Programming","isbn-type":"print","author":"Sanders","year":"2011","ISBN":"http:\/\/id.crossref.org\/isbn\/9780131387683"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0017","first-page":"1081","article-title":"The sparse matrix vector product on GPUs","volume":"2","author":"V\u00e1zquez","year":"2009","journal-title":"Proceedings of the 2009 International Conference on Computational and Mathematical Methods in Science and Engineering"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0018","series-title":"Proceedings of the 2010 10th IEEE International Conference on Computer and Information Technology","isbn-type":"print","doi-asserted-by":"crossref","first-page":"1146","DOI":"10.1109\/CIT.2010.208","article-title":"Improving the performance of the sparse matrix vector with GPUs","author":"V\u00e1zquez","year":"2010","ISBN":"http:\/\/id.crossref.org\/isbn\/9780769541082"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0019","series-title":"High Performance Embedded Architectures and Compilers: 5th International Conference","isbn-type":"print","doi-asserted-by":"crossref","first-page":"111","DOI":"10.1007\/978-3-642-11515-8_10","article-title":"Automatically Tuning Sparse Matrix-Vector Multiplication for GPU Architectures","author":"Monakov","year":"2010","ISBN":"http:\/\/id.crossref.org\/isbn\/9783642115158"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0020","series-title":"ACM Sigplan Notices","first-page":"115","article-title":"Model-driven autotuning of sparse matrix-vector multiply on GPUs.","author":"Choi","year":"2010"},{"key":"10.1016\/j.advengsoft.2021.102997_sbref0021","series-title":"On efficient data structures for sparse matrix storage","author":"Hossain","year":"2006"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0022","series-title":"Introduction to High Performance Computing for Scientifics and Engineers","isbn-type":"print","author":"Hager","year":"2011","ISBN":"http:\/\/id.crossref.org\/isbn\/9781439811924"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0023","doi-asserted-by":"crossref","first-page":"C401","DOI":"10.1137\/130930352","article-title":"A unified sparse matrix data format for modern processors with wide SIMD units","volume":"36","author":"Kreutzer","year":"2014","journal-title":"SIAM Journal on Scientific Computing"},{"issue":"1","key":"10.1016\/j.advengsoft.2021.102997_bib0024","doi-asserted-by":"crossref","first-page":"57","DOI":"10.1002\/nme.5346","article-title":"An efficient sparse matrix-vector multiplication on CUDA-enabled graphic processing units for finite element method simulations","volume":"110","author":"Altinkaynak","year":"2017","journal-title":"International Journal for Numerical Methods in Engineering"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0025","series-title":"CSR5: An Efficient Storage Format for Cross-Platform Sparse Matrix-Vector Multiplication","author":"Liu","year":"2015"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0026","doi-asserted-by":"crossref","DOI":"10.1002\/cpe.4864","article-title":"AXC: A new format to perform the SpMV oriented to Intel Xeon Phi architecture in OpenCL","volume":"31","author":"Coronado-Barrientos","year":"2019","journal-title":"Concurrency and Computation: Practice and Experience"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0027","series-title":"OpenMP: Portable Multi-Level Parallelism on Modern Systems","isbn-type":"print","doi-asserted-by":"crossref","first-page":"115","DOI":"10.1007\/978-3-030-58144-2_8","article-title":"Towards an Auto-Tuned and Task-Based SpMV (LASs Library)","author":"Catal\u00e1n","year":"2020","ISBN":"http:\/\/id.crossref.org\/isbn\/9783030581442"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0028","unstructured":"Lin Y., Grover V.. Using CUDA Warp-Level Primitives. https:\/\/devblogs.nvidia.com\/using-cuda-warp-level-primitives\/; Accessed: 2020-02-06."},{"key":"10.1016\/j.advengsoft.2021.102997_bib0029","series-title":"Tech. Rep.","article-title":"Efficient Parallel Scan Algorithms for GPUs","author":"S. Sengupta and M. Harris and M. Garland","year":"2008"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0030","series-title":"GPU Gems 3","first-page":"39","article-title":"Parallel Prefix Sum (Scan) with CUDA","author":"Sengupta","year":"2007"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0031","series-title":"Structured Parallel Programming Patterns for Efficient Computation","isbn-type":"print","author":"McCool","year":"2012","ISBN":"http:\/\/id.crossref.org\/isbn\/9780124159938"},{"key":"10.1016\/j.advengsoft.2021.102997_bib0032","unstructured":"Edinburgh Parallel Computing Centre (EPCC). Cirrus. http:\/\/www.cirrus.ac.uk\/; Accessed: 2020-03-02."},{"key":"10.1016\/j.advengsoft.2021.102997_bib0033","unstructured":"Intel. Intel Xeon Gold 6148 Processor. https:\/\/ark.intel.com\/content\/www\/us\/en\/ark\/products\/120489\/intel-xeon-gold-6148-processor-27-5m-cache-2-40-ghz.html; Accessed: 2020-03-02."},{"key":"10.1016\/j.advengsoft.2021.102997_bib0034","unstructured":"WikiChip. Intel Xeon Gold 6148. https:\/\/en.wikichip.org\/wiki\/intel\/xeon_gold\/6148#Memory_controller."},{"key":"10.1016\/j.advengsoft.2021.102997_bib0035","series-title":"High Performance Parallelism Pearls Multicore and Many-core Programming Approaches","isbn-type":"print","author":"Reinders","year":"2015","ISBN":"http:\/\/id.crossref.org\/isbn\/9780128021187"},{"key":"10.1016\/j.advengsoft.2021.102997_sbref0036","series-title":"V100 GPU","article-title":"NVIDIA Tesla","author":"NVIDIA","year":"2020"},{"key":"10.1016\/j.advengsoft.2021.102997_sbref0037","series-title":"SuiteSparse Matrix Collection","author":"Texas A&M University","year":"2020"}],"container-title":["Advances in Engineering Software"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0965997821000260?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0965997821000260?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2021,12,7]],"date-time":"2021-12-07T03:06:03Z","timestamp":1638846363000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0965997821000260"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6]]},"references-count":37,"alternative-id":["S0965997821000260"],"URL":"http:\/\/dx.doi.org\/10.1016\/j.advengsoft.2021.102997","relation":{},"ISSN":["0965-9978"],"issn-type":[{"value":"0965-9978","type":"print"}],"subject":[],"published":{"date-parts":[[2021,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"A new AXT format for an efficient SpMV product using AVX-512 instructions and CUDA","name":"articletitle","label":"Article Title"},{"value":"Advances in Engineering Software","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.advengsoft.2021.102997","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2021 The Authors. Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"102997"}}