{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2022,4,3]],"date-time":"2022-04-03T21:59:02Z","timestamp":1649023142197},"reference-count":24,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2014,12,4]],"date-time":"2014-12-04T00:00:00Z","timestamp":1417651200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Analog Integr Circ Sig Process"],"published-print":{"date-parts":[[2015,1]]},"DOI":"10.1007\/s10470-014-0441-7","type":"journal-article","created":{"date-parts":[[2014,12,3]],"date-time":"2014-12-03T12:08:50Z","timestamp":1417608530000},"page":"147-158","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Algorithm-oriented design of efficient many-core architectures applied to dense matrix multiplication"],"prefix":"10.1007","volume":"82","author":[{"given":"Wilson M.","family":"Jos\u00e9","sequence":"first","affiliation":[]},{"given":"Ana Rita","family":"Silva","sequence":"additional","affiliation":[]},{"given":"M\u00e1rio P.","family":"V\u00e9stias","sequence":"additional","affiliation":[]},{"given":"Hor\u00e1cio C.","family":"Neto","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2014,12,4]]},"reference":[{"key":"441_CR1","unstructured":"Hofstee, H.P. (2005). Power efficient processor architecture and the cell processor. In HPCA\u201905: Proceedings of the 11th International Symposium on High-Performance Computer Architecture, Washington: IEEE Computer Society."},{"issue":"1","key":"441_CR2","doi-asserted-by":"crossref","first-page":"29","DOI":"10.1109\/JSSC.2007.910957","volume":"43","author":"S Vangal","year":"2008","unstructured":"Vangal, S., Howard, J., Ruhl, G., Dighe, S., Wilson, H., Tschanz, J., et al. (2008). An 80-tile sub-100w teraflops processor in 65-nm CMOS. IEEE Journal of Solid-State Circuits, 43(1), 29\u201341.","journal-title":"IEEE Journal of Solid-State Circuits"},{"key":"441_CR3","unstructured":"CSX700 Floating Point Processor. Datasheet 06-PD-1425 Rev 1, ClearSpeed Technology Ltd (2011)."},{"key":"441_CR4","unstructured":"http:\/\/www.intel.com\/content\/www\/us\/en\/high-performance-computing\/high-performance-xeon-phi-coprocessor-brief.html"},{"key":"441_CR5","unstructured":"http:\/\/www.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/performance-briefs\/xeon-phi-product-family-performance-brief"},{"issue":"3","key":"441_CR6","doi-asserted-by":"crossref","first-page":"12","DOI":"10.1145\/1356052.1356053","volume":"34","author":"K Goto","year":"2008","unstructured":"Goto, K., & Geijn, R. (2008). Anatomy of a high-performance matrix multiplication. ACM Transactions on Mathematical Software , 34(3), 12.","journal-title":"ACM Transactions on Mathematical Software"},{"key":"441_CR7","unstructured":"Liao, T.G.S.(2002). System design with SystemC.Norwell: Kluwer Academic Publishers"},{"issue":"1","key":"441_CR8","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1377603.1377607","volume":"35","author":"K Goto","year":"2008","unstructured":"Goto, K., & Geijn, R. (2008). High performance implementation of the level-3 BLAS. ACM Transactions on Mathematical Software , 35(1), 1\u201314.","journal-title":"ACM Transactions on Mathematical Software"},{"issue":"12","key":"441_CR9","doi-asserted-by":"crossref","first-page":"1724","DOI":"10.1109\/TC.2012.132","volume":"61","author":"A Pedram","year":"2012","unstructured":"Pedram, A., van de Geijn, R., & Gerstlauer, A. (2012). Codesign tradeoffs for high-performance, low-power linear algebra architectures. IEEE Transactions on Computers, 61(12), 1724\u20131736.","journal-title":"IEEE Transactions on Computers"},{"key":"441_CR10","unstructured":"Matam, K., Le, H., & Prasanna, V. (2013). Energy efficient architecture for matrix multiplication on FPGAs, FPL. IEEE Transactions on Very Large Scale Integration (VLSI) Systems, 13(11), 1305\u20131319"},{"key":"441_CR11","doi-asserted-by":"crossref","unstructured":"Allada, V., et al. (2009). Performance analysis of memory transfers and GEMM subroutines on NVIDIA Tesla GPU Cluster. Proceedings of IEEE International Conference Cluster Computing and Workshops, pp. 1\u20139.","DOI":"10.1109\/CLUSTR.2009.5289124"},{"key":"441_CR12","doi-asserted-by":"crossref","unstructured":"Volkov, V., et al. (2008). Benchmarking GPUs to tune dense linear algebra. Proceedings ACM\/IEEE Conference on Supercomputing, pp. 1\u201311.","DOI":"10.1109\/SC.2008.5214359"},{"key":"441_CR13","doi-asserted-by":"crossref","unstructured":"Tan, G., et al. (2011). Fast implementation of DGEMM on Fermi GPU. Proceedings of International Conference for High Performance Computing, Networking, Storage and Analysis.","DOI":"10.1145\/2063384.2063431"},{"key":"441_CR14","unstructured":"Wang, Z., Tan, H., & Ranka, S. (2012). Energy and performance tradeoffs for matrix multiplication on multicore machines. International Conference on Green Computing, pp. 1,6, 4\u20138 June."},{"key":"441_CR15","doi-asserted-by":"crossref","unstructured":"Dou, Y., Vassiliadis, S., Kuzmanov, G., & Gaydadjiev, G. (2005). 64-bit floating-point FPGA matrix multiplication. In ACM\/SIGMA 13th International Symposium on Field-Programmable Gate Arrays, pp. 86\u201395.","DOI":"10.1145\/1046192.1046204"},{"issue":"8","key":"441_CR16","doi-asserted-by":"crossref","first-page":"1057","DOI":"10.1109\/TC.2008.55","volume":"57","author":"L Zhuo","year":"2008","unstructured":"Zhuo, L., & Prasanna, Viktor K. (2008). High-performance designs for linear algebra operations on reconfigurable hardware. IEEE Transactions on Computers, 57(8), 1057\u20131071.","journal-title":"IEEE Transactions on Computers"},{"key":"441_CR17","doi-asserted-by":"crossref","unstructured":"Lin, C., So, H., & Leong, P. (2011). A model for matrix multiplication performance on FPGAs. In International Conference on Field Programmable Logic and Applications, pp. 305\u2013310.","DOI":"10.1109\/FPL.2011.62"},{"key":"441_CR18","volume-title":"Scientific computing - an introduction with parallel computing","author":"G Golub","year":"1993","unstructured":"Golub, G., & Ortega, J. (1993). Scientific computing - an introduction with parallel computing. San Diego: Academic Press Inc."},{"key":"441_CR19","unstructured":"http:\/\/www.techpowerup.com\/gpudb\/2482\/xeon-phi-5120d.html"},{"key":"441_CR20","unstructured":"http:\/\/ark.intel.com\/products\/37151\/"},{"key":"441_CR21","unstructured":"Kanter, D. (Sept. 2009). Inside Fermi: Nvidia\u2019s HPC Push. Technical Report, Real World Technologies."},{"key":"441_CR22","unstructured":"http:\/\/www.anandtech.com\/show\/6774\/nvidias-geforce-gtx-titan-part-2-titans-performance-unveiled\/3"},{"key":"441_CR23","unstructured":"http:\/\/www.tomshardware.com\/reviews\/geforce-gtx-titan-gk110-review,3438.html"},{"key":"441_CR24","unstructured":"Ware, M., et al. (2010). Architecting for power management: The IBM POWER7 approach. Proceedings of IEEE 16th International Symposium on High Performance Computer Architecture, pp. 1\u201311."}],"container-title":["Analog Integrated Circuits and Signal Processing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10470-014-0441-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10470-014-0441-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10470-014-0441-7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,8,18]],"date-time":"2019-08-18T03:26:28Z","timestamp":1566098788000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10470-014-0441-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014,12,4]]},"references-count":24,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2015,1]]}},"alternative-id":["441"],"URL":"https:\/\/doi.org\/10.1007\/s10470-014-0441-7","relation":{},"ISSN":["0925-1030","1573-1979"],"issn-type":[{"value":"0925-1030","type":"print"},{"value":"1573-1979","type":"electronic"}],"subject":[],"published":{"date-parts":[[2014,12,4]]}}}