{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T21:59:42Z","timestamp":1770069582259,"version":"3.49.0"},"reference-count":28,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2018,1,25]],"date-time":"2018-01-25T00:00:00Z","timestamp":1516838400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"NSFC","doi-asserted-by":"crossref","award":["61572158"],"award-info":[{"award-number":["61572158"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"NSFC","doi-asserted-by":"crossref","award":["61602132"],"award-info":[{"award-number":["61602132"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Shenzhen Science and Technology Program","award":["JSGG20150512145714247"],"award-info":[{"award-number":["JSGG20150512145714247"]}]},{"name":"Shenzhen Science and Technology Program","award":["JCYJ20160330163900579"],"award-info":[{"award-number":["JCYJ20160330163900579"]}]},{"name":"Shenzhen Science and Technology Program","award":["JCYJ20170413105929681"],"award-info":[{"award-number":["JCYJ20170413105929681"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2019,8]]},"DOI":"10.1007\/s00521-018-3354-z","type":"journal-article","created":{"date-parts":[[2018,1,25]],"date-time":"2018-01-25T12:24:56Z","timestamp":1516883096000},"page":"4353-4365","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":23,"title":["CPU versus GPU: which can perform matrix computation faster\u2014performance comparison for basic linear algebra subprograms"],"prefix":"10.1007","volume":"31","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0011-859X","authenticated-orcid":false,"given":"Feng","family":"Li","sequence":"first","affiliation":[]},{"given":"Yunming","family":"Ye","sequence":"additional","affiliation":[]},{"given":"Zhaoyang","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Xiaofeng","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,1,25]]},"reference":[{"issue":"6","key":"3354_CR1","doi-asserted-by":"publisher","first-page":"1311","DOI":"10.1016\/j.patcog.2004.01.013","volume":"37","author":"KS Oh","year":"2004","unstructured":"Oh KS, Jung K (2004) GPU implementation of neural networks. Pattern Recogn 37(6):1311\u20131314","journal-title":"Pattern Recogn"},{"issue":"3\u20134","key":"3354_CR2","doi-asserted-by":"publisher","first-page":"609","DOI":"10.1007\/s00521-013-1408-9","volume":"23","author":"D Baptista","year":"2013","unstructured":"Baptista D, Morgado-Dias F (2013) A survey of artificial neural network training tools. Neural Comput Appl 23(3\u20134):609\u2013615","journal-title":"Neural Comput Appl"},{"issue":"3\u20134","key":"3354_CR3","doi-asserted-by":"publisher","first-page":"591","DOI":"10.1007\/s00521-013-1406-y","volume":"23","author":"D Baptista","year":"2013","unstructured":"Baptista D, Abreu S, Freitas F et al (2013) A survey of software and hardware use in artificial neural networks. Neural Comput Appl 23(3\u20134):591\u2013599","journal-title":"Neural Comput Appl"},{"issue":"3","key":"3354_CR4","first-page":"451","volume":"38","author":"VW Lee","year":"2010","unstructured":"Lee VW, Kim C, Chhugani J et al (2010) Debunking the 100X GPU vs. CPU myth: an evaluation of throughput computing on CPU and GPU. Int Symp Comput Archit 38(3):451\u2013460","journal-title":"Int Symp Comput Archit"},{"issue":"1","key":"3354_CR5","doi-asserted-by":"publisher","first-page":"80","DOI":"10.1111\/j.1467-8659.2007.01012.x","volume":"26","author":"JD Owens","year":"2007","unstructured":"Owens JD, Luebke D, Govindaraju NK et al (2007) A survey of general-purpose computation on graphics hardware. Comput Gr Forum 26(1):80\u2013113","journal-title":"Comput Gr Forum"},{"issue":"1","key":"3354_CR6","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1016\/j.jpdc.2012.04.003","volume":"73","author":"AR Brodtkorb","year":"2013","unstructured":"Brodtkorb AR, Hagen TR, Saetra ML et al (2013) Graphics processing unit (GPU) programming strategies and trends in GPU computing. J Parallel Distrib Comput 73(1):4\u201313","journal-title":"J Parallel Distrib Comput"},{"issue":"3","key":"3354_CR7","doi-asserted-by":"publisher","first-page":"308","DOI":"10.1145\/355841.355847","volume":"5","author":"CL Lawson","year":"1979","unstructured":"Lawson CL, Hanson RJ, Kincaid DR et al (1979) Basic linear algebra subprograms for fortran usage. ACM Trans Math Softw 5(3):308\u2013323","journal-title":"ACM Trans Math Softw"},{"key":"3354_CR8","unstructured":"AMD, AMD Core Math Library (ACML). \n                    http:\/\/developer.amd.com\/acml"},{"key":"3354_CR9","doi-asserted-by":"crossref","unstructured":"Wang E, Zhang Q, Shen B et al (2014) Intel math kernel library. High-Performance Computing on the Intel Xeon Phi. Springer International Publishing, Berlin, pp 167\u2013188","DOI":"10.1007\/978-3-319-06486-4_7"},{"key":"3354_CR10","doi-asserted-by":"crossref","unstructured":"Barrachina S, Castillo M, Igual FD et al (2008) Evaluation and tuning of the level 3 CUBLAS for graphics processors. In: IEEE international symposium on parallel and distributed processing, pp 1\u20138","DOI":"10.1109\/IPDPS.2008.4536485"},{"key":"3354_CR11","doi-asserted-by":"publisher","DOI":"10.1137\/1.9780898719604","volume-title":"LAPACK users\u2019 guide","author":"E Anderson","year":"1999","unstructured":"Anderson E, Bai Z, Bischof C et al (1999) LAPACK users\u2019 guide. Society for Industrial and Applied Mathematics, Philadelphia, PA"},{"key":"3354_CR12","unstructured":"Moler C (2000) Matlab incorporates LAPACK. Increasing the speed and capabilities of matrix computation, MATLAB News and NotesCWinter"},{"issue":"2","key":"3354_CR13","doi-asserted-by":"publisher","first-page":"22","DOI":"10.1109\/MCSE.2011.37","volume":"13","author":"S Walt","year":"2011","unstructured":"Walt S, Colbert SC, Varoquaux G (2011) The NumPy array: a structure for efficient numerical computation. Comput Sci Eng 13(2):22\u201330","journal-title":"Comput Sci Eng"},{"key":"3354_CR14","first-page":"368","volume-title":"Joint weighted nonnegative matrix factorization for mining attributed graphs. Pacific-Asia conference on knowledge discovery and data mining","author":"Z Huang","year":"2017","unstructured":"Huang Z, Ye Y, Li X et al (2017) Joint weighted nonnegative matrix factorization for mining attributed graphs. Pacific-Asia conference on knowledge discovery and data mining. Springer, Cham, pp 368\u2013380"},{"issue":"6","key":"3354_CR15","doi-asserted-by":"publisher","first-page":"1625","DOI":"10.1109\/TSMCC.2012.2227112","volume":"43","author":"H Zhang","year":"2013","unstructured":"Zhang H, Ho JKL, Wu QMJ et al (2013) Multidimensional latent semantic analysis using term spatial information. IEEE Trans Cybern 43(6):1625\u20131640","journal-title":"IEEE Trans Cybern"},{"key":"3354_CR16","unstructured":"Abadi M, Agarwal A, Barham P et al (2016) Tensorflow: large-scale machine learning on heterogeneous distributed systems"},{"key":"3354_CR17","doi-asserted-by":"crossref","unstructured":"Jia Y, Shelhamer E, Donahue J et al (2014) Caffe: convolutional architecture for fast feature embedding. In: Proceedings of the 22nd ACM international conference on Multimedia, pp 675\u2013678","DOI":"10.1145\/2647868.2654889"},{"issue":"2","key":"3354_CR18","doi-asserted-by":"publisher","first-page":"616","DOI":"10.1109\/TII.2016.2601521","volume":"13","author":"H Zhang","year":"2017","unstructured":"Zhang H, Li J, Ji Y et al (2017) Understanding subtitles by character-level sequence-to-sequence learning. IEEE Trans Industr Inf 13(2):616\u2013624","journal-title":"IEEE Trans Industr Inf"},{"key":"3354_CR19","unstructured":"Uzair M, Shafait F, Ghanem B et al (2015) Representation learning with deep extreme learning machines for efficient image set classification. Neural Comput Appl, pp 1\u201313"},{"issue":"2","key":"3354_CR20","doi-asserted-by":"publisher","first-page":"520","DOI":"10.1109\/TII.2016.2605629","volume":"13","author":"H Zhang","year":"2017","unstructured":"Zhang H, Cao X, Ho JKL et al (2017) Object-level video advertising: an optimization framework. IEEE Trans Industr Inf 13(2):520\u2013531","journal-title":"IEEE Trans Industr Inf"},{"key":"3354_CR21","doi-asserted-by":"crossref","unstructured":"Guo H, Tang R, Ye Y et al (2017) DeepFM: a factorization-machine based neural network for CTR prediction. In: The twenty-sixth international joint conference on artificial intelligence (IJCAI), pp 1725\u20131731","DOI":"10.24963\/ijcai.2017\/239"},{"issue":"1","key":"3354_CR22","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/42288.42291","volume":"14","author":"J Dongarra","year":"1988","unstructured":"Dongarra J, DuCroz J, Hammarling S et al (1988) An extended set of FORTRAN basic linear algebra subprograms. ACM Trans Math Softw 14(1):1\u201317","journal-title":"ACM Trans Math Softw"},{"issue":"1","key":"3354_CR23","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/77626.79170","volume":"16","author":"J Dongarra","year":"1990","unstructured":"Dongarra J, DuCroz J, Hammarling S et al (1990) A set of level 3 basic linear algebra subprograms. ACM Trans Math Softw 16(1):1\u201317","journal-title":"ACM Trans Math Softw"},{"key":"3354_CR24","doi-asserted-by":"crossref","unstructured":"Mukunoki D, Imamura T, Takahashi D (2015) Fast implementation of general matrix\u2013vector multiplication (GEMV) on Kepler GPUs. In: 23rd Euromicro international conference on parallel, distributed and network-based processing (PDP), IEEE,, pp 642\u2013650","DOI":"10.1109\/PDP.2015.66"},{"key":"3354_CR25","unstructured":"Danihelka I, Wayne G, Uria B et al (2016) Associative long short-term memory. arXiv preprint \n                    arXiv:1602.03032"},{"issue":"4","key":"3354_CR26","doi-asserted-by":"publisher","first-page":"511","DOI":"10.1177\/1094342010385729","volume":"24","author":"R Nath","year":"2010","unstructured":"Nath R, Tomov S, Dongarra J (2010) An improved MAGMA GEMM for Fermi graphics processing units. Int J High Perform Comput Appl 24(4):511\u2013515","journal-title":"Int J High Perform Comput Appl"},{"issue":"4","key":"3354_CR27","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1145\/1964218.1964227","volume":"38","author":"N Nakasato","year":"2011","unstructured":"Nakasato N (2011) A fast GEMM implementation on the Cypress GPU. ACM SIGMETRICS Perform Eval Rev 38(4):50\u201355","journal-title":"ACM SIGMETRICS Perform Eval Rev"},{"issue":"1","key":"3354_CR28","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1016\/0167-8191(88)90009-9","volume":"6","author":"CH Romine","year":"1988","unstructured":"Romine CH, Ortega JM (1988) Parallel solution of triangular systems of equations. Parallel Comput 6(1):109\u2013114","journal-title":"Parallel Comput"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s00521-018-3354-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-018-3354-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-018-3354-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,9,6]],"date-time":"2019-09-06T14:33:27Z","timestamp":1567780407000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s00521-018-3354-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,1,25]]},"references-count":28,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2019,8]]}},"alternative-id":["3354"],"URL":"https:\/\/doi.org\/10.1007\/s00521-018-3354-z","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"value":"0941-0643","type":"print"},{"value":"1433-3058","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,1,25]]},"assertion":[{"value":"8 November 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 January 2018","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 January 2018","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance with ethical standards"}},{"value":"No conflict of interest exits in the submission of this manuscript.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}