{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T04:38:39Z","timestamp":1774931919133,"version":"3.50.1"},"reference-count":23,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2015,7,22]],"date-time":"2015-07-22T00:00:00Z","timestamp":1437523200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001823","name":"Ministry of Education, Youth and Sports (CZ)","doi-asserted-by":"publisher","award":["ED3.2.00\/08.0144"],"award-info":[{"award-number":["ED3.2.00\/08.0144"]}],"id":[{"id":"10.13039\/501100001823","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001823","name":"Ministry of Education, Youth and Sports (CZ)","doi-asserted-by":"publisher","award":["CZ.1.07\/2.3.00\/30.0037"],"award-info":[{"award-number":["CZ.1.07\/2.3.00\/30.0037"]}],"id":[{"id":"10.13039\/501100001823","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2015,10]]},"DOI":"10.1007\/s11227-015-1483-z","type":"journal-article","created":{"date-parts":[[2015,7,21]],"date-time":"2015-07-21T10:01:35Z","timestamp":1437472895000},"page":"3934-3957","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":58,"title":["Optimizing CUDA code by kernel fusion: application on BLAS"],"prefix":"10.1007","volume":"71","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5703-9673","authenticated-orcid":false,"given":"Ji\u0159\u00ed","family":"Filipovi\u010d","sequence":"first","affiliation":[]},{"given":"Mat\u00fa\u0161","family":"Madzin","sequence":"additional","affiliation":[]},{"given":"Jan","family":"Fousek","sequence":"additional","affiliation":[]},{"given":"Lud\u011bk","family":"Matyska","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2015,7,22]]},"reference":[{"key":"1483_CR1","doi-asserted-by":"crossref","unstructured":"Belter G, Jessup ER, Karlin I, Siek JG (2009) Automating the generation of composed linear algebra kernels. In: Proceedings of the conference on high performance computing, networking, storage and analysis (SC09), ACM, 2009, pp 1\u201312","DOI":"10.1145\/1654059.1654119"},{"key":"1483_CR2","doi-asserted-by":"crossref","unstructured":"Blackford LS, Demmel J, Dongarra J, Duff I, Hammarling S, Henry G, Heroux M, Kaufman L, Lumsdaine A, Petitet A, Pozo R, Remington K, Whaley RC (2002) An updated set of basic linear algebra subprograms (BLAS). ACM Trans Math Softw 28:135\u2013151","DOI":"10.1145\/567806.567807"},{"key":"1483_CR3","doi-asserted-by":"crossref","unstructured":"Catanzaro B, Garland M, Keutzer K (2011) Copperhead: compiling an embedded data parallel language In: The 16th ACM symposium on principles and practice of parallel programming (PPoPP)","DOI":"10.1145\/1941553.1941562"},{"key":"1483_CR4","unstructured":"Cole M (1989) Algorithmic skeletons: structural management of parallel computation. Research monographs in parallel and distributed computing. MIT Press, Cambridge"},{"key":"1483_CR5","doi-asserted-by":"crossref","unstructured":"Dehnavi MM, Fernandez DM, Giannacopoulos D (2011) Enhancing the performance of conjugate gradient solvers on graphic processing units. IEEE Trans Magn 47:1162\u20131165","DOI":"10.1109\/TMAG.2010.2081662"},{"key":"1483_CR6","doi-asserted-by":"crossref","unstructured":"Filipovi\u010d J, Fousek J, Lakom\u00fd B, Madzin M (2012) Automatically optimized GPU acceleration of element subroutines in finite element method. In: Symposium on application accelerators in high-performance computing (SAAHPC)","DOI":"10.1109\/SAAHPC.2012.23"},{"key":"1483_CR7","doi-asserted-by":"crossref","unstructured":"Fousek J, Filipovi\u010d J, Madzin M (2011) Automatic fusions of CUDA-GPU kernels for parallel map. In: Second international workshop on highly-efficient accelerators and reconfigurable technologies (HEART), pp 42\u201347","DOI":"10.1145\/2082156.2082183"},{"key":"1483_CR8","doi-asserted-by":"crossref","unstructured":"Gonz\u00e1lez-V\u00e9lez H, Leyton M (2010) A survey of algorithmic skeleton frameworks: high-level structured parallel programming enablers. Softw Pract Exp 40:1135\u20131160","DOI":"10.1002\/spe.1026"},{"key":"1483_CR9","unstructured":"Gulati K, Khatri SP (2009) An automated approach for simd kernel generation for GPU based software acceleration. In: Symposium on application accelerators in high performance computing (SAAHPC)"},{"key":"1483_CR10","doi-asserted-by":"crossref","unstructured":"Gupta K, Stuart JA, Owens JD (2012) A study of persistent threads style GPU programming for GPGPU workloads. In: Innovative parallel computing","DOI":"10.1109\/InPar.2012.6339596"},{"key":"1483_CR11","unstructured":"Hoberock J, Bell N (2009) Thrust: a parallel template library"},{"key":"1483_CR12","doi-asserted-by":"crossref","unstructured":"Howell GW, Demmel JW, Fulton CT, Hammarling S, Marmol K (2008) Cache efficient bidiagonalization using BLAS 2.5 operators. ACM Trans Math Softw (TOMS) 34:1\u201314","DOI":"10.1145\/1356052.1356055"},{"key":"1483_CR13","doi-asserted-by":"crossref","unstructured":"Iverson KE (1962) A programming language. In: Spring joint computer conference (AIEE-IRE)","DOI":"10.1145\/1460833.1460872"},{"key":"1483_CR14","doi-asserted-by":"crossref","unstructured":"Larsen B (2011) Simple optimizations for an applicative array language for graphics processors. In: Proceedings of the sixth workshop on Declarative aspects of multicore programming (DAMP), 2011","DOI":"10.1145\/1926354.1926360"},{"key":"1483_CR15","doi-asserted-by":"crossref","unstructured":"Meng J, Morozov VA, Kumaran K, Vishwanath V, Uram TD (2011) Grophecy: GPU performance projection from CPU code skeletons. In: International conference for high performance computing, networking, storage and analysis (SC11)","DOI":"10.1145\/2063384.2063402"},{"key":"1483_CR16","doi-asserted-by":"crossref","unstructured":"Meng J, Morozov VA, Vishwanath V, Kumaran K (2012) Dataflow-driven gpu performance projection for multi-kernel transformations. In: International conference for high performance computing, networking, storage and analysis (SC12)","DOI":"10.1109\/SC.2012.42"},{"key":"1483_CR17","unstructured":"NVIDIA, CUDA C Programming Guide, version 6.5., (2014)"},{"key":"1483_CR18","doi-asserted-by":"crossref","unstructured":"Russell FP, Mellor MR, Kelly PH, Beckmann O (2011) DESOLA: an active linear algebra library using delayed evaluation and runtime code generation. Sci Comput Program 76:227\u2013242","DOI":"10.1016\/j.scico.2008.06.002"},{"key":"1483_CR19","doi-asserted-by":"crossref","unstructured":"Sato S, Iwasaki H (2009) A skeletal parallel framework with fusion optimizer for GPGPU programming. In: Programming languages and systems, vol 5904 of Lecture Notes in Computer Science. Springer Berlin","DOI":"10.1007\/978-3-642-10672-9_8"},{"key":"1483_CR20","doi-asserted-by":"crossref","unstructured":"Tabik S, Ortega G, Garz\u00f3n EM (2014) Performance evaluation of kernel fusion blas routines on the GPU: iterative solvers as case study. J Supercomput 70:577\u2013587","DOI":"10.1007\/s11227-014-1102-4"},{"key":"1483_CR21","doi-asserted-by":"crossref","unstructured":"Tarditi D, Puri S, Oglesby J (2006) Accelerator: using data parallelism to program GPUs for general-purpose uses, SIGARCH Computer Architecture News, 34","DOI":"10.1145\/1168919.1168898"},{"key":"1483_CR22","doi-asserted-by":"crossref","unstructured":"Wahib M, Marutama N (2014) Scalable kernel fusion for memory-bound GPU applications. In: International conference for high performance computing, networking, storage and analysis (SC14)","DOI":"10.1109\/SC.2014.21"},{"key":"1483_CR23","doi-asserted-by":"crossref","unstructured":"Wang G, Lin Y, Yi W (2010) Kernel fusion: an effective method for better power efficiency on multithreaded GPU. In: IEEE\/ACM international conference on green computing and communications and international conference on cyber, physical and social computing (GREENCOM\u2013CPSCOM)","DOI":"10.1109\/GreenCom-CPSCom.2010.102"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-015-1483-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-015-1483-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-015-1483-z","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,8,28]],"date-time":"2019-08-28T10:25:11Z","timestamp":1566987911000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-015-1483-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,7,22]]},"references-count":23,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2015,10]]}},"alternative-id":["1483"],"URL":"https:\/\/doi.org\/10.1007\/s11227-015-1483-z","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2015,7,22]]}}}