{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T13:51:39Z","timestamp":1770990699771,"version":"3.50.1"},"reference-count":29,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2014,11,6]],"date-time":"2014-11-06T00:00:00Z","timestamp":1415232000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Front. Comput. Sci."],"published-print":{"date-parts":[[2015,6]]},"DOI":"10.1007\/s11704-014-4127-1","type":"journal-article","created":{"date-parts":[[2014,11,14]],"date-time":"2014-11-14T11:40:22Z","timestamp":1415965222000},"page":"431-441","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Memory bandwidth optimization of SpMV on GPGPUs"],"prefix":"10.1007","volume":"9","author":[{"given":"Chenggang Clarence","family":"Yan","sequence":"first","affiliation":[]},{"given":"Hui","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Weizhi","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Yingping","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Bochuan","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Zhu","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Yuxuan","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Jian","family":"Yin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2014,11,6]]},"reference":[{"key":"4127_CR1","first-page":"30","volume-title":"Proceedings of the 18th IEEE International Conference on Parallel and Distributed Systems","author":"W Xu","year":"2012","unstructured":"Xu W, Liu Z, Wu J, Ye X, Jiao S, Wang D, Song F, Fan D. Auto-tuning GEMV on many-core GPU. In: Proceedings of the 18th IEEE International Conference on Parallel and Distributed Systems. 2012, 30\u201336"},{"issue":"5","key":"4127_CR2","doi-asserted-by":"crossref","first-page":"573","DOI":"10.1109\/LSP.2014.2310494","volume":"21","author":"C G Yan","year":"2014","unstructured":"Yan C G, Zhang Y D, Xu J Z, Dai F, Li L, Dai Q H, Wu F. A highly parallel framework for HEVC coding unit partitioning tree decision on many-core processors. IEEE Signal Processing letters, 2014, 21(5): 573\u2013576","journal-title":"IEEE Signal Processing letters"},{"key":"4127_CR3","doi-asserted-by":"crossref","unstructured":"Yan C G, Zhang Y D, Dai F, Zhang J, Li L, Dai Q H. Efficient parallel HEVC intra prediction on many-core processor. Electronics Letters (in press)","DOI":"10.1049\/el.2014.0611"},{"key":"4127_CR4","first-page":"1","volume":"99","author":"C Yan","year":"2014","unstructured":"Yan C, Zhang Y, Xu J, Dai F, Zhang J, Dai Q, Wu F. Efficient parpallel framework for HEVC motion estimation on many-core processors. IEEE Transactions on Circuits and Systems for Video Technology, 2014, 99: 1","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"4127_CR5","first-page":"63","volume-title":"Proceedings of Data Compression Conference","author":"C G Yan","year":"2013","unstructured":"Yan C G, Zhang Y D, Dai F, Li L. Highly parallel framework for HEVC motion estimation on many-core platform. In: Proceedings of Data Compression Conference, 2013, 63\u201372"},{"issue":"3","key":"4127_CR6","doi-asserted-by":"crossref","first-page":"510","DOI":"10.1109\/TMM.2012.2190391","volume":"14","author":"Y D Zhang","year":"2012","unstructured":"Zhang Y D, Yan C G, Dai F, Ma Y. Efficient parallel framework for H.264\/AVC deblocking filter on many-core platform. IEEE Transactions on Multimedia, 2012, 14(3): 510\u2013524","journal-title":"IEEE Transactions on Multimedia"},{"key":"4127_CR7","first-page":"1","volume-title":"Proceedings of the International Conference on Multimedia and Expo","author":"C G Yan","year":"2011","unstructured":"Yan C G, Dai F, Zhang Y D, Ma Y, Chen L C, Fan L J, Zheng Y S. Parallel deblocking filter for H.264\/AVC implemented on Tile64 platform. In: Proceedings of the International Conference on Multimedia and Expo. 2011, 1\u201368"},{"key":"4127_CR8","first-page":"18","volume-title":"Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis","author":"N Bell","year":"2009","unstructured":"Bell N, Garland M. Implementing sparse matrix-vector multiplication on throughput-oriented processors. In: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis. 2009, 18"},{"key":"4127_CR9","volume-title":"Compute Unified Device Architecture Programming Guide","author":"C Nvidia","year":"2007","unstructured":"Nvidia C. Compute Unified Device Architecture Programming Guide. 2007"},{"key":"4127_CR10","volume-title":"Dissertation for the Doctoral Degree","author":"E Im","year":"2000","unstructured":"Im E. Optimizing the performance of sparse matrix-vector multiplication. Dissertation for the Doctoral Degree. Berkeley: University of California, 2000"},{"key":"4127_CR11","volume-title":"Dissertation for the Doctoral Degree","author":"R W Vuduc","year":"2003","unstructured":"Vuduc R W. Automatic performance tuning of sparse matrix kernels. Dissertation for the Doctoral Degree. Berkeley: University of California, 2003"},{"key":"4127_CR12","volume-title":"Dissertation for the Doctoral Degree","author":"S W Williams","year":"2008","unstructured":"Williams S W. Auto-tuning performance on multicore computers. Dissertation for the Doctoral Degree. Berkeley: University of California, 2008"},{"issue":"3","key":"4127_CR13","doi-asserted-by":"crossref","first-page":"178","DOI":"10.1016\/j.parco.2008.12.006","volume":"35","author":"S Williams","year":"2009","unstructured":"Williams S, Oliker L, Vuduc R, Shalf J, Yelick K, Demmel J. Optimization of sparse matrix-vector multiplication on emerging multicore platforms. Parallel Computing, 2009, 35(3): 178\u2013194","journal-title":"Parallel Computing"},{"issue":"3","key":"4127_CR14","doi-asserted-by":"crossref","first-page":"917","DOI":"10.1145\/882262.882364","volume":"22","author":"J Bolz","year":"2003","unstructured":"Bolz J, Farmer I, Grinspun E, Schr\u00f6oder P. Sparse matrix solvers on the GPU: conjugate gradients and multigrid. ACM Transactions on Graphics, 2003, 22(3): 917\u2013924","journal-title":"ACM Transactions on Graphics"},{"key":"4127_CR15","first-page":"97","volume-title":"Proceedings of Graphics Hardware","author":"S Sengupta","year":"2007","unstructured":"Sengupta S, Harris M, Zhang Y, Owens J D. Scan primitives for GPU computing. In: Proceedings of Graphics Hardware. 2007, 97\u2013106"},{"key":"4127_CR16","volume-title":"Technical Report, NVIDIA Technical Report NVR-2008-004","author":"N Bell","year":"2008","unstructured":"Bell N, Garland M. Efficient Sparse Matrix-vector Multiplication on Cuda. Technical Report, NVIDIA Technical Report NVR-2008-004. 2008"},{"key":"4127_CR17","volume-title":"IBM Reserach Report RC24704 (W0812-047)","author":"M M Baskaran","year":"2008","unstructured":"Baskaran M M, Bordawekar R. Optimizing Sparse Matrix-vector Multiplication on GPUs Using Compile-time and Run-time Strategies. IBM Reserach Report RC24704 (W0812-047). 2008."},{"key":"4127_CR18","first-page":"893","volume-title":"Proceedings of the Computational Science","author":"A Cevahir","year":"2009","unstructured":"Cevahir A, Nukada A, Matsuoka S. Fast conjugate gradients with multiple GPUs. In: Proceedings of the Computational Science. 2009, 893\u2013903."},{"key":"4127_CR19","first-page":"1081","volume-title":"Proceedings of the 2009 International Conference on Computational and Mathematical Methods in Science and Engineering","author":"F V\u00e1zquez","year":"2009","unstructured":"V\u00e1zquez F, Garz\u00f3n E M, Martnez J A, Fern\u00e1ndez J J. The sparse matrix vector product on GPUs. In: Proceedings of the 2009 International Conference on Computational and Mathematical Methods in Science and Engineering. 2009, 1081\u20131092"},{"key":"4127_CR20","doi-asserted-by":"crossref","first-page":"111","DOI":"10.1007\/978-3-642-11515-8_10","volume-title":"Proceedings of the High Performance Embedded Architectures and Compilers","author":"A Monakov","year":"2010","unstructured":"Monakov A, Lokhmotov A, Avetisyan A. Automatically tuning sparse matrix-vector multiplication for GPU architectures. In: Proceedings of the High Performance Embedded Architectures and Compilers. 2010, 111\u2013125"},{"issue":"5","key":"4127_CR21","doi-asserted-by":"crossref","first-page":"115","DOI":"10.1145\/1837853.1693471","volume":"45","author":"J W Choi","year":"2010","unstructured":"Choi J W, Singh A, Vuduc R W. Model-driven autotuning of sparse matrix-vector multiply on GPUs. ACM Sigplan Notices, 2010, 45(5): 115\u2013126","journal-title":"ACM Sigplan Notices"},{"key":"4127_CR22","doi-asserted-by":"crossref","first-page":"1154","DOI":"10.1109\/ICCIS.2010.285","volume-title":"Proceedings of the 2010 International Conference on Computational and Information Sciences (ICCIS)","author":"P Guo","year":"2010","unstructured":"Guo P, Wang L. Auto-tuning cuda parameters for sparse matrixvector multiplication on GPUs. In: Proceedings of the 2010 International Conference on Computational and Information Sciences (ICCIS). 2010, 1154\u20131157"},{"issue":"4","key":"4127_CR23","doi-asserted-by":"crossref","first-page":"231","DOI":"10.14778\/1938545.1938548","volume":"4","author":"X Yang","year":"2011","unstructured":"Yang X, Parthasarathy S, Sadayappan P. Fast sparse matrix-vector multiplication on GPUs: implications for graph mining. Proceedings of the VLDB Endowment, 2011, 4(4): 231\u2013242","journal-title":"Proceedings of the VLDB Endowment"},{"key":"4127_CR24","first-page":"231","volume-title":"Proceedings of the 13th ACIS International Conference on Software Engineering, Artificial Intelligence, Networking and Parallel & Distributed Computing (SNPD)","author":"W Xu","year":"2012","unstructured":"Xu W, Zhang H, Jiao S, Wang D, Song F, Liu Z. Optimizing sparse matrix vector multiplication using cache blocking method on fermi GPU. In: Proceedings of the 13th ACIS International Conference on Software Engineering, Artificial Intelligence, Networking and Parallel & Distributed Computing (SNPD). 2012, 231\u2013235"},{"key":"4127_CR25","doi-asserted-by":"crossref","first-page":"721","DOI":"10.1109\/IPDPS.2011.73","volume-title":"Proceedings of the 2011 IEEE International Parallel & Distributed Processing Symposium","author":"A Buluc","year":"2011","unstructured":"Buluc A, Williams S, Oliker L, Demmel J. Reduced-bandwidth multithreaded algorithms for sparse matrix-vector multiplication. In: Proceedings of the 2011 IEEE International Parallel & Distributed Processing Symposium. 2011, 721\u2013733"},{"key":"4127_CR26","first-page":"233","volume-title":"Proceedings of the 21st annual symposium on Parallelism in algorithms and architectures","author":"A Bulu\u00e7","year":"2009","unstructured":"Bulu\u00e7 A, Fineman J T, Frigo M, Gilbert J R, Leiserson C E. Parallel sparse matrix-vector and matrix-transpose-vector multiplication using compressed sparse blocks. In: Proceedings of the 21st annual symposium on Parallelism in algorithms and architectures. 2009, 233\u2013244"},{"issue":"8","key":"4127_CR27","doi-asserted-by":"crossref","first-page":"247","DOI":"10.1145\/2038037.1941587","volume":"46","author":"K Kourtis","year":"2011","unstructured":"Kourtis K, Karakasis V, Goumas G, Koziris N. Csx: an extended compression format for spmv on shared memory systems. ACM SIGPLAN Notices, 2011, 46(8): 247\u2013256","journal-title":"ACM SIGPLAN Notices"},{"key":"4127_CR28","doi-asserted-by":"crossref","first-page":"307","DOI":"10.1145\/1183401.1183444","volume-title":"Proceedings of the 20th annual international conference on Supercomputing","author":"J Willcock","year":"2006","unstructured":"Willcock J, Lumsdaine A. Accelerating sparse matrix computations via data compression. In: Proceedings of the 20th annual international conference on Supercomputing. 2006, 307\u2013316"},{"issue":"1","key":"4127_CR29","first-page":"71","volume":"6","author":"WZ Xu","year":"2012","unstructured":"Xu WZ, Liu Z Y, Fan D R, Jiao S, Ye X C, Song F L, Yan C. G Accelerating sparse matrix vector multiplication on many-core GPUs.World Academy of Science, Engineering and Technology, 2012, 6(1): 71\u201378","journal-title":"World Academy of Science, Engineering and Technology"}],"container-title":["Frontiers of Computer Science"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11704-014-4127-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11704-014-4127-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11704-014-4127-1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,6,1]],"date-time":"2019-06-01T17:01:20Z","timestamp":1559408480000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11704-014-4127-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014,11,6]]},"references-count":29,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2015,6]]}},"alternative-id":["4127"],"URL":"https:\/\/doi.org\/10.1007\/s11704-014-4127-1","relation":{},"ISSN":["2095-2228","2095-2236"],"issn-type":[{"value":"2095-2228","type":"print"},{"value":"2095-2236","type":"electronic"}],"subject":[],"published":{"date-parts":[[2014,11,6]]}}}