{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T00:37:08Z","timestamp":1767141428881,"version":"build-2238731810"},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2015,9,22]],"date-time":"2015-09-22T00:00:00Z","timestamp":1442880000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2016,8]]},"DOI":"10.1007\/s10766-015-0378-1","type":"journal-article","created":{"date-parts":[[2015,9,22]],"date-time":"2015-09-22T05:31:24Z","timestamp":1442899884000},"page":"801-830","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Optimizing the Matrix Multiplication Using Strassen and Winograd Algorithms with Limited Recursions on Many-Core"],"prefix":"10.1007","volume":"44","author":[{"given":"Ayaz ul Hassan","family":"Khan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mayez","family":"Al-Mouhamed","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Allam","family":"Fatayer","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nazeeruddin","family":"Mohammad","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2015,9,22]]},"reference":[{"key":"378_CR1","unstructured":"Al-Mouhamed, M., ul\u00a0Hassan\u00a0Khan, A.: Exploration of automatic optimization for CUDA programming. Int. J. Parallel Emerg. Distrib. Syst. pp. 1\u201316 (2014)"},{"key":"378_CR2","doi-asserted-by":"crossref","unstructured":"Badin, M., D\u2019Alberto, P., Bic, L., Dillencourt, M., Nicolau, A.: Improving numerical accuracy for non-negative matrix multiplication on GPUs using recursive algorithms. In: Proceedings of the 27th International ACM Conference on Supercomputing, pp. 213\u2013222. ACM (2013)","DOI":"10.1145\/2464996.2465010"},{"issue":"3","key":"378_CR3","doi-asserted-by":"crossref","first-page":"603","DOI":"10.1137\/0909040","volume":"9","author":"DH Bailey","year":"1988","unstructured":"Bailey, D.H.: Extra high speed matrix multiplication on the cray-2. SIAM J. Sci. Stat. Comput. 9(3), 603\u2013607 (1988)","journal-title":"SIAM J. Sci. Stat. Comput."},{"key":"378_CR4","doi-asserted-by":"crossref","unstructured":"Ballard, G., Demmel, J., Holtz, O., Lipshitz, B., Schwartz, O.: Communication-optimal parallel algorithm for Strassen\u2019s matrix multiplication. In: Proceedings of the 24th ACM Symposium on Parallelism in Algorithms and Architectures, SPAA \u201912, pp. 193\u2013204. ACM (2012)","DOI":"10.1145\/2312005.2312044"},{"issue":"2","key":"378_CR5","doi-asserted-by":"crossref","first-page":"107","DOI":"10.1145\/2556647.2556660","volume":"57","author":"G Ballard","year":"2014","unstructured":"Ballard, G., Demmel, J., Holtz, O., Schwartz, O.: Communication costs of Strassen\u2019s matrix multiplication. Commun. ACM 57(2), 107\u2013114 (2014)","journal-title":"Commun. ACM"},{"issue":"2","key":"378_CR6","doi-asserted-by":"crossref","first-page":"327","DOI":"10.1007\/s10586-013-0279-2","volume":"17","author":"C Chen","year":"2014","unstructured":"Chen, C., Taha, T.: A communication reduction approach to iteratively solve large sparse linear systems on a GPGPU cluster. Cluster Comput. 17(2), 327\u2013337 (2014)","journal-title":"Cluster Comput."},{"key":"378_CR7","doi-asserted-by":"crossref","unstructured":"Coppersmith, D., Winograd, S.: Matrix multiplication via arithmetic progressions. In: Proceedings of the Nineteenth Annual ACM Symposium on Theory of Computing, STOC \u201987, pp. 1\u20136. ACM (1987)","DOI":"10.1145\/28395.28396"},{"issue":"2","key":"378_CR8","doi-asserted-by":"crossref","first-page":"243","DOI":"10.1007\/s10586-013-0329-9","volume":"17","author":"S Costarelli","year":"2014","unstructured":"Costarelli, S., Storti, M., Paz, R., Dalcin, L., Idelsohn, S.: GPGPU implementation of the BFECC algorithm for pure advection equations. Cluster Comput. 17(2), 243\u2013254 (2014)","journal-title":"Cluster Comput."},{"key":"378_CR9","doi-asserted-by":"crossref","unstructured":"Cui, X., Chen, Y., Zhang, C., Mei, H.: Auto-tuning dense matrix multiplication for GPGPU with cache. In: IEEE 16th International Conference on Parallel and Distributed Systems (ICPADS), pp. 237\u2013242 (2010)","DOI":"10.1109\/ICPADS.2010.64"},{"issue":"1\u20132","key":"378_CR10","doi-asserted-by":"crossref","first-page":"53","DOI":"10.1080\/10637199408915454","volume":"4","author":"B Dumitrescu","year":"1994","unstructured":"Dumitrescu, B., Roch, J.L., Trystram, D.: Fast matrix multiplication algorithms on MIMD architectures. Parallel Algorithms Appl. 4(1\u20132), 53\u201370 (1994)","journal-title":"Parallel Algorithms Appl."},{"key":"378_CR11","doi-asserted-by":"crossref","unstructured":"Heinecke, A., Vaidyanathan, K., Smelyanskiy, M., Kobotov, A., Dubtsov, R., Henry, G., Shet, A.G., Chrysos, G., Dubey, P.: Design and implementation of the Linpack benchmark for single and multi-node systems based on Intel Xeon Phi coprocessor. In: International Symposium on Parallel and Distributed Processing, pp. 126\u2013137 (2013)","DOI":"10.1109\/IPDPS.2013.113"},{"key":"378_CR12","unstructured":"Intel Corporation: Intel Knights Corner: Software Developer Guide (2012)"},{"key":"378_CR13","unstructured":"Intel Corporation: Intel Xeon Phi: Coprocessor Instruction Set Architecture, Reference Manual (2012)"},{"issue":"8","key":"378_CR14","doi-asserted-by":"crossref","first-page":"687","DOI":"10.1002\/(SICI)1099-1506(199912)6:8<687::AID-NLA177>3.0.CO;2-I","volume":"6","author":"I Kaporin","year":"1999","unstructured":"Kaporin, I.: A practical algorithm for faster matrix multiplication. Numer. Linear Algebra Appl. 6(8), 687\u2013700 (1999)","journal-title":"Numer. Linear Algebra Appl."},{"key":"378_CR15","unstructured":"Kirk, D.B., Hwu, W.m.W.: Programming Massively Parallel Processors: A Hands-on Approach, 1st edn. Morgan Kaufmann Pub. (2010)"},{"key":"378_CR16","unstructured":"Kurzak, J., Tomov, S., Dongarra, J.: Autotuning GEMMs for Fermi. Tech. Rep. 245, LAPACK Working Note (2011)"},{"key":"378_CR17","doi-asserted-by":"crossref","unstructured":"Lai, P.W., Arafat, H., Elango, V., Sadayappan, P.: Accelerating Strassen-Winograd\u2019s matrix multiplication algorithm on GPUs. In: 20th International Conference on High Performance Computing (HiPC), 2013, pp. 139\u2013148 (2013)","DOI":"10.1109\/HiPC.2013.6799109"},{"issue":"2","key":"378_CR18","doi-asserted-by":"crossref","first-page":"384","DOI":"10.1007\/s10766-013-0252-y","volume":"42","author":"C Lee","year":"2014","unstructured":"Lee, C., Ro, W., Gaudiot, J.L.: Boosting CUDA applications with CPUGPU hybrid computing. Int. J. Parallel Program. 42(2), 384\u2013404 (2014)","journal-title":"Int. J. Parallel Program."},{"key":"378_CR19","doi-asserted-by":"crossref","unstructured":"Li, J., Ranka, S., Sahni, S.: Strassen\u2019s Matrix Multiplication on GPUs. In: Proceedings of the 2011 IEEE 17th International Conference on Parallel and Distributed Systems, ICPADS \u201911, pp. 157\u2013164 (2011)","DOI":"10.1109\/ICPADS.2011.130"},{"key":"378_CR20","doi-asserted-by":"crossref","unstructured":"Li, Y., Dongarra, J., Tomov, S.: A Note on Auto-tuning GEMM for GPUs. In: Proceedings of the 9th International Conference on Computational Science, ICCS \u201909, pp. 884\u2013892 (2009)","DOI":"10.1007\/978-3-642-01970-8_89"},{"key":"378_CR21","doi-asserted-by":"crossref","unstructured":"Lipshitz, B., Ballard, G., Demmel, J., Schwartz, O.: Communication-avoiding parallel Strassen: Implementation and performance. In: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, SC \u201912, pp. 101:1\u201311 (2012)","DOI":"10.1109\/SC.2012.33"},{"issue":"4","key":"378_CR22","doi-asserted-by":"crossref","first-page":"511","DOI":"10.1177\/1094342010385729","volume":"24","author":"R Nath","year":"2010","unstructured":"Nath, R., Tomov, S., Dongarra, J.: An improved MAGMA GEMM for fermi graphics. Int. J. High Perform. Comput. Appl. 24(4), 511\u2013515 (2010)","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"378_CR23","unstructured":"NVIDIA: CUBLAS (2013). https:\/\/developer.nvidia.com\/cuBLAS"},{"key":"378_CR24","doi-asserted-by":"crossref","unstructured":"Pan, V.Y.: How to Multiply Matrices Faster. Lecture Notes in Computer Science. vol. 179. Springer (1984)","DOI":"10.1007\/3-540-13866-8"},{"key":"378_CR25","volume-title":"An Overview of Programming for Intel Xeon processors and Intel Xeon Phi coprocessors","author":"J Reinders","year":"2012","unstructured":"Reinders, J.: An Overview of Programming for Intel Xeon processors and Intel Xeon Phi coprocessors. Intel Corporation, Santa Clara (2012)"},{"issue":"9","key":"378_CR26","first-page":"1","volume":"38","author":"S Robinson","year":"2005","unstructured":"Robinson, S.: Toward an optimal algorithm for matrix multiplication. SIAM News 38(9), 1\u20133 (2005)","journal-title":"SIAM News"},{"issue":"4","key":"378_CR27","doi-asserted-by":"crossref","first-page":"354","DOI":"10.1007\/BF02165411","volume":"13","author":"V Strassen","year":"1969","unstructured":"Strassen, V.: Gaussian elimination is not optimal. Numerische Mathematik 13(4), 354\u2013356 (1969)","journal-title":"Numerische Mathematik"},{"key":"378_CR28","doi-asserted-by":"crossref","unstructured":"Volkov, V., Demmel, J.W.: Benchmarking GPUs to tune dense linear algebra. In: Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing, SC \u201908, pp. 31:1\u201311 (2008)","DOI":"10.1109\/SC.2008.5214359"},{"key":"378_CR29","doi-asserted-by":"crossref","unstructured":"Wei, S.C., Huang, B.: Accelerating volkov\u2019s hybrid implementation of cholesky factorization on a fermi gpu. In: Parallel and Distributed Systems (ICPADS), 2012 IEEE 18th International Conference on, pp. 896\u2013900. IEEE (2012)","DOI":"10.1109\/ICPADS.2012.147"},{"key":"378_CR30","unstructured":"Williams, V.: Multiplying matrices in $$o(n^{2.373})$$ o ( n 2.373 ) time. Stanford University (2014). http:\/\/theory.stanford.edu\/~virgi\/matrixmult-f.pdf"},{"key":"378_CR31","unstructured":"Winograd, S.: Some remarks on fast multiplication of polynomials. Complexity of Sequential and Parallel Numerical Algorithms p. 181 (1973)"},{"issue":"6","key":"378_CR32","doi-asserted-by":"crossref","first-page":"768","DOI":"10.1007\/s10766-012-0228-3","volume":"41","author":"Y Yang","year":"2013","unstructured":"Yang, Y., Zhou, H.: The implementation of a high performance GPGPU compiler. Int. J. Parallel Program. 41(6), 768\u2013781 (2013)","journal-title":"Int. J. Parallel Program."}],"updated-by":[{"DOI":"10.1007\/s10766-015-0397-y","type":"correction","label":"Correction","source":"publisher","updated":{"date-parts":[[2015,11,23]],"date-time":"2015-11-23T00:00:00Z","timestamp":1448236800000}}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-015-0378-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10766-015-0378-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-015-0378-1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,5,30]],"date-time":"2019-05-30T20:02:33Z","timestamp":1559246553000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10766-015-0378-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,9,22]]},"references-count":32,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2016,8]]}},"alternative-id":["378"],"URL":"https:\/\/doi.org\/10.1007\/s10766-015-0378-1","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"value":"0885-7458","type":"print"},{"value":"1573-7640","type":"electronic"}],"subject":[],"published":{"date-parts":[[2015,9,22]]}}}