{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T04:23:01Z","timestamp":1742962981319,"version":"3.40.3"},"publisher-location":"Berlin, Heidelberg","reference-count":22,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783642371684"},{"type":"electronic","value":"9783642371691"}],"license":[{"start":{"date-parts":[[2013,1,1]],"date-time":"2013-01-01T00:00:00Z","timestamp":1356998400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2013,1,1]],"date-time":"2013-01-01T00:00:00Z","timestamp":1356998400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2013]]},"DOI":"10.1007\/978-3-642-37169-1_28","type":"book-chapter","created":{"date-parts":[[2013,3,26]],"date-time":"2013-03-26T14:03:46Z","timestamp":1364306626000},"page":"285-294","source":"Crossref","is-referenced-by-count":3,"title":["Superlinear Speedup for Matrix Multiplication in GPU Devices"],"prefix":"10.1007","author":[{"given":"Leonid","family":"Djinevski","sequence":"first","affiliation":[]},{"given":"Sasko","family":"Ristov","sequence":"additional","affiliation":[]},{"given":"Marjan","family":"Gusev","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"28_CR1","first-page":"483","volume-title":"AFIPS Conference Proceedings","author":"G.M. Amdahl","year":"1967","unstructured":"Amdahl, G.M.: Validity of the single-processor approach to achieving large scale computing capabilities. In: AFIPS Conference Proceedings, April 18-20, vol.\u00a030, pp. 483\u2013485. AFIPS Press, Reston (1967)"},{"key":"28_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, E., Bai, Z., Bischof, C., Blackford, S., Demmel, J., Dongarra, J., Croz, J.D., Greenbaum, A., Hammarling, S., McKenney, A., Sorensen, D.: LAPACK Users\u2019 Guide. Soc. for Ind. and Appl. Math., 3rd edn., PA (1999)","DOI":"10.1137\/1.9780898719604"},{"key":"28_CR3","unstructured":"Bell, N., Garland, M.: The impact of cache misses on the performance of matrix product algorithms on multicore platforms. Research Report NVR-2008-004 (December 2008), \n                    http:\/\/hal.inria.fr\/inria-00537822\/en\/"},{"issue":"2","key":"28_CR4","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1145\/567806.567807","volume":"28","author":"L.S. Blackford","year":"2002","unstructured":"Blackford, L.S., et al.: An updated set of basic linear algebra subprograms (blas). ACM Trans. Math. Softw.\u00a028(2), 135\u2013151 (2002)","journal-title":"ACM Trans. Math. Softw."},{"key":"28_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"450","DOI":"10.1007\/978-3-642-29737-3_50","volume-title":"Euro-Par 2011: Parallel Processing Workshops","author":"D. Clarke","year":"2012","unstructured":"Clarke, D., Lastovetsky, A., Rychkov, V.: Column-based matrix partitioning for parallel matrix multiplication on heterogeneous processors based on functional performance models. In: Alexander, M., D\u2019Ambra, P., Belloum, A., Bosilca, G., Cannataro, M., Danelutto, M., Di Martino, B., Gerndt, M., Jeannot, E., Namyst, R., Roman, J., Scott, S.L., Traff, J.L., Vall\u00e9e, G., Weidendorfer, J. (eds.) Euro-Par 2011, Part I. LNCS, vol.\u00a07155, pp. 450\u2013459. Springer, Heidelberg (2012)"},{"key":"28_CR6","volume-title":"HCW 2012","author":"A. DeFlumere","year":"2012","unstructured":"DeFlumere, A., Lastovetsky, A., Becker, B.: Partitioning for parallel matrix-matrix multiplication with heterogeneous processors: The optimal solution. In: HCW 2012. IEEE Computer Society, Shanghai (2012)"},{"key":"28_CR7","unstructured":"Glaskowsky, P.: Nvidias fermi: the first complete gpu computing architecture. Tech. rep., NVIDIA (2009) (white Paper)"},{"key":"28_CR8","unstructured":"Grama, A., Karypis, G., Kumar, V., Gupta, A.: Introduction to Parallel Computing, 2nd edn. Addison-Wesley (January 2003)"},{"key":"28_CR9","doi-asserted-by":"crossref","unstructured":"Gusev, M., Ristov, S.: Superlinear speedup in windows azure cloud. Tech. Rep. IIT:06-12, University Ss Cyril and Methodius, Skopje, Macedonia, Faculty of Information Sciences and Computer Engineering (July 2012)","DOI":"10.1109\/CloudNet.2012.6483679"},{"issue":"5","key":"28_CR10","doi-asserted-by":"publisher","first-page":"532","DOI":"10.1145\/42411.42415","volume":"31","author":"J.L. Gustafson","year":"1988","unstructured":"Gustafson, J.L.: Reevaluating amdahl\u2019s law. ACM\u00a031(5), 532\u2013533 (1988)","journal-title":"ACM"},{"key":"28_CR11","unstructured":"Jacquelin, M., Marchal, L., Robert, Y.: The impact of cache misses on the performance of matrix product algorithms on multicore platforms. Research Report RR-7456, INRIA (November 2010), \n                    http:\/\/hal.inria.fr\/inria-00537822\/en\/"},{"key":"28_CR12","volume-title":"Programming Massively Parallel Processors: A Hands-on Approach","author":"D. Kirk","year":"2010","unstructured":"Kirk, D., Hwu, W.M.: Programming Massively Parallel Processors: A Hands-on Approach, 1st edn. Morgan Kaufmann Publishers Inc., USA (2010)","edition":"1"},{"issue":"2","key":"28_CR13","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1109\/MM.2008.31","volume":"28","author":"E. Lindholm","year":"2008","unstructured":"Lindholm, E., Nickolls, J., Oberman, S., Montrym, J.: Nvidia tesla: A unified graphics and computing architecture. IEEE Micro\u00a028(2), 39\u201355 (2008)","journal-title":"IEEE Micro"},{"issue":"4","key":"28_CR14","doi-asserted-by":"publisher","first-page":"511","DOI":"10.1177\/1094342010385729","volume":"24","author":"R. Nath","year":"2010","unstructured":"Nath, R., Tomov, S., Dongarra, J.: An improved magma gemm for fermi graphics processing units. Int. J. High Perf. C. App.\u00a024(4), 511\u2013515 (2010)","journal-title":"Int. J. High Perf. C. App."},{"issue":"2","key":"28_CR15","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1109\/MM.2010.41","volume":"30","author":"J. Nickolls","year":"2010","unstructured":"Nickolls, J., Dally, W.: The gpu computing era. IEEE Micro\u00a030(2), 56\u201369 (2010)","journal-title":"IEEE Micro"},{"key":"28_CR16","unstructured":"NVIDIA: Cuda programming guide (Auguest 2012), \n                    http:\/\/developer.download.nvidia.com\/compute\/DevZone\/docs\/html\/C\/doc\/CUDA_C_Programming_Guide.pdf\/"},{"key":"28_CR17","unstructured":"NVIDIA: Next generation cuda compute architecture: Kepler gk110 (2012)"},{"issue":"1","key":"28_CR18","doi-asserted-by":"publisher","first-page":"73","DOI":"10.1002\/cpe.1726","volume":"24","author":"D.P. Playne","year":"2012","unstructured":"Playne, D.P., Hawick, K.A.: Comparison of gpu architectures for asynchronous communication with finite-differencing applications. Concurrency and Computation: Practice and Experience\u00a024(1), 73\u201383 (2012)","journal-title":"Concurrency and Computation: Practice and Experience"},{"key":"28_CR19","unstructured":"Ristov, S., Gusev, M.: Superlinear speedup for matrix multiplication. In: Proceedings of the ITI 2012 34th International Conference on Information Technology Interfaces, pp. 499\u2013504 (2012)"},{"key":"28_CR20","doi-asserted-by":"crossref","unstructured":"Ristov, S., Gusev, M., Kostoska, M., Kjiroski, K.: Virtualized environments in cloud can have superlinear speedup. In: ACM Proceedings of 5th Balkan Conference of Informatics, BCI 2012 (2012)","DOI":"10.1145\/2371316.2371319"},{"key":"28_CR21","doi-asserted-by":"crossref","unstructured":"Volkov, V., Demmel, J.W.: Benchmarking gpus to tune dense linear algebra. In: Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing, SC 2008, pp. 31:1\u201331:11. IEEE Press, Piscataway (2008)","DOI":"10.1109\/SC.2008.5214359"},{"issue":"2","key":"28_CR22","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/MM.2011.24","volume":"31","author":"C.M. Wittenbrink","year":"2011","unstructured":"Wittenbrink, C.M., Kilgariff, E., Prabhu, A.: Fermi gf100 gpu architecture. IEEE Micro\u00a031(2), 50\u201359 (2011)","journal-title":"IEEE Micro"}],"container-title":["Advances in Intelligent Systems and Computing","ICT Innovations 2012"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-37169-1_28","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,2,19]],"date-time":"2023-02-19T11:01:41Z","timestamp":1676804501000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-642-37169-1_28"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2013]]},"ISBN":["9783642371684","9783642371691"],"references-count":22,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-37169-1_28","relation":{},"ISSN":["2194-5357","2194-5365"],"issn-type":[{"type":"print","value":"2194-5357"},{"type":"electronic","value":"2194-5365"}],"subject":[],"published":{"date-parts":[[2013]]}}}