{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T03:24:10Z","timestamp":1740108250491,"version":"3.37.3"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2021,6,3]],"date-time":"2021-06-03T00:00:00Z","timestamp":1622678400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,6,3]],"date-time":"2021-06-03T00:00:00Z","timestamp":1622678400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Computing"],"published-print":{"date-parts":[[2021,10]]},"DOI":"10.1007\/s00607-021-00958-2","type":"journal-article","created":{"date-parts":[[2021,6,3]],"date-time":"2021-06-03T13:05:38Z","timestamp":1622725538000},"page":"2171-2202","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["FusionCL: a machine-learning based approach for OpenCL kernel fusion to increase system performance"],"prefix":"10.1007","volume":"103","author":[{"given":"Yasir Noman","family":"Khalid","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8342-5757","authenticated-orcid":false,"given":"Muhammad","family":"Aleem","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Usman","family":"Ahmed","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Radu","family":"Prodan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Muhammad Arshad","family":"Islam","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Muhammad Azhar","family":"Iqbal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,6,3]]},"reference":[{"key":"958_CR1","doi-asserted-by":"publisher","first-page":"259","DOI":"10.1016\/j.future.2020.07.017","volume":"114","author":"T Rausch","year":"2021","unstructured":"Rausch T, Rashed R, Dustdar S (2021) Optimized container scheduling for data-intensive serverless edge computing. Futur Gener Comput Syst 114:259\u2013271. https:\/\/doi.org\/10.1016\/j.future.2020.07.017","journal-title":"Futur Gener Comput Syst"},{"key":"958_CR2","doi-asserted-by":"publisher","first-page":"79","DOI":"10.1016\/J.JPDC.2019.05.015","volume":"132","author":"YN Khalid","year":"2019","unstructured":"Khalid YN, Aleem M, Ahmed U, Islam MA, Iqbal MA (2019) Troodon: a machine-learning based load-balancing application scheduler for CPU\u2013GPU system. J Parallel Distrib Comput 132:79\u201394. https:\/\/doi.org\/10.1016\/J.JPDC.2019.05.015","journal-title":"J Parallel Distrib Comput"},{"key":"958_CR3","doi-asserted-by":"publisher","unstructured":"Rohr D et al (2014) An energy-efficient multi-GPU supercomputer. In: 2014 IEEE international conference on high performance computing and communications, 2014 IEEE 6th international symposium on cyberspace safety and security, 2014 IEEE 11th international conference on embedded software and system (HPCC, CSS, ICESS). pp 42\u201345. https:\/\/doi.org\/10.1109\/HPCC.2014.14","DOI":"10.1109\/HPCC.2014.14"},{"key":"958_CR4","doi-asserted-by":"crossref","unstructured":"Jog A et al (2015) Anatomy of GPU memory system for multi-application execution. In: Proceedings of the 2015 international symposium on memory systems. pp 223\u2013234. https:\/\/www.cs.utexas.edu\/~skeckler\/pubs\/MEMSYS_2015_Anatomy.pdf. Accessed 31 Jul 2019","DOI":"10.1145\/2818950.2818979"},{"key":"958_CR5","doi-asserted-by":"publisher","unstructured":"Papadimitriou M, Markou E, Fumero J, Stratikopoulos A, Blanaru F, Kotselidis C (2021) Multiple-tasks on multiple-devices (MTMD): exploiting concurrency in heterogeneous managed runtimes. In: Proceedings of the 17th ACM SIGPLAN\/SIGOPS international conference on virtual execution environments. pp 125\u2013138. https:\/\/doi.org\/10.1145\/3453933.3454019","DOI":"10.1145\/3453933.3454019"},{"key":"958_CR6","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-018-2435-1","author":"YN Khalid","year":"2018","unstructured":"Khalid YN, Aleem M, Prodan R, Iqbal MA, Islam MA (2018) E-OSched: a load balancing scheduler for heterogeneous multicores. J Supercomput. https:\/\/doi.org\/10.1007\/s11227-018-2435-1","journal-title":"J Supercomput"},{"key":"958_CR7","unstructured":"OpenCL overview\u2014The Khronos Group Inc (2021). https:\/\/www.khronos.org\/opencl\/ Accessed 02 May 2021"},{"key":"958_CR8","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.5606","author":"U Ahmed","year":"2019","unstructured":"Ahmed U, Aleem M, Noman Khalid Y, Arshad Islam M, Azhar Iqbal M (2019) RALB-HC: a resource-aware load balancer for heterogeneous cluster. Concurr Comput Pract Exp. https:\/\/doi.org\/10.1002\/cpe.5606","journal-title":"Concurr Comput Pract Exp"},{"key":"958_CR9","doi-asserted-by":"crossref","unstructured":"Munshi A (2009) The OpenCL specification. In: Hot chips 21 symposium (HCS). IEEE, pp 1\u2013314. https:\/\/www.khronos.org\/registry\/OpenCL\/specs\/opencl-1.2.pdf. Accessed 30 Oct 2017","DOI":"10.1109\/HOTCHIPS.2009.7478342"},{"key":"958_CR10","volume-title":"Multi-tasking scheduling for heterogeneous systems","author":"Y Wen","year":"2017","unstructured":"Wen Y (2017) Multi-tasking scheduling for heterogeneous systems. University of Edinburgh, Edinburgh"},{"key":"958_CR11","unstructured":"AMD E-Series E2\u20137110 Notebook Processor\u2014NotebookCheck.net Tech. https:\/\/www.notebookcheck.net\/AMD-E-Series-E2-7110-Notebook-Processor.144996.0.html. Accessed 03 May 2021"},{"issue":"3","key":"958_CR12","doi-asserted-by":"publisher","first-page":"451","DOI":"10.1145\/1815961.1816021","volume":"38","author":"VW Lee","year":"2010","unstructured":"Lee VW et al (2010) Debunking the 100X GPU vs. CPU myth: An evaluation of throughput computing on CPU and GPU. Isca 38(3):451\u2013460. https:\/\/doi.org\/10.1145\/1815961.1816021","journal-title":"Isca"},{"issue":"3","key":"958_CR13","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/3430936","volume":"64","author":"NC Thompson","year":"2021","unstructured":"Thompson NC, Spanuth S (2021) The decline of computers as a general purpose technology. Commun ACM 64(3):64\u201372. https:\/\/doi.org\/10.1145\/3430936","journal-title":"Commun ACM"},{"issue":"3","key":"958_CR14","doi-asserted-by":"publisher","first-page":"201","DOI":"10.1145\/2508148.2485940","volume":"41","author":"BA Hechtman","year":"2013","unstructured":"Hechtman BA, Sorin DJ (2013) exploring memory consistency for massively-threaded throughput-oriented processors. ACM SIGARCH Comput Archit News 41(3):201\u2013212","journal-title":"ACM SIGARCH Comput Archit News"},{"issue":"9","key":"958_CR15","doi-asserted-by":"publisher","first-page":"1941","DOI":"10.1007\/s00607-020-00827-4","volume":"102","author":"U Kiran","year":"2020","unstructured":"Kiran U, Gautam SS, Sharma D (2020) GPU-based matrix-free finite element solver exploiting symmetry of elemental matrices. Computing 102(9):1941\u20131965. https:\/\/doi.org\/10.1007\/s00607-020-00827-4","journal-title":"Computing"},{"key":"958_CR16","unstructured":"Lee S-Y, Wu C-J (2018) Performance characterization, prediction, and optimization for heterogeneous systems with multi-level memory interference. In: 2017 IEEE international symposium on workload characterization (IISWC), pp 43\u201353. [Online]. https:\/\/pdfs.semanticscholar.org\/bfed\/ce6668172edbec76fc67c29f7a320979c110.pdf. Accessed 07 Feb 2018"},{"key":"958_CR17","doi-asserted-by":"publisher","unstructured":"Baruah T et al (2020) Valkyrie: leveraging inter-TLB locality to enhance GPU performance. In: Parallel architectures and compilation techniques\u2014conference proceedings, PACT, pp 456\u2013466. https:\/\/doi.org\/10.1145\/3410463.3414639","DOI":"10.1145\/3410463.3414639"},{"issue":"12","key":"958_CR18","doi-asserted-by":"publisher","first-page":"2607","DOI":"10.1007\/s00607-020-00846-1","volume":"102","author":"H Kang","year":"2020","unstructured":"Kang H, Kwon HC, Kim D (2020) HPMaX: heterogeneous parallel matrix multiplication using CPUs and GPUs. Computing 102(12):2607\u20132631. https:\/\/doi.org\/10.1007\/s00607-020-00846-1","journal-title":"Computing"},{"key":"958_CR19","doi-asserted-by":"publisher","unstructured":"Chilingaryan S, Kopmann A, Ametova E, Mirone A (2018) ESRF: balancing load of GPU subsystems to accelerate image reconstruction in parallel beam tomography. In: 30th international symposium on computer architecture and high performance computing (SBAC-PAD). pp 158\u2013166. https:\/\/doi.org\/10.1109\/CAHPC.2018.8645862","DOI":"10.1109\/CAHPC.2018.8645862"},{"key":"958_CR20","doi-asserted-by":"publisher","unstructured":"Shen M, Luo G (2017) Corolla: GPU-accelerated FPGA routing based on subgraph dynamic expansion. In: Proceedings of the 2017 ACM\/SIGDA international symposium on field-programmable gate arrays. pp 105\u2013114. https:\/\/doi.org\/10.1145\/3020078.3021732","DOI":"10.1145\/3020078.3021732"},{"key":"958_CR21","doi-asserted-by":"crossref","unstructured":"Zhao Z, Song L, Xie R, Yang X (2016) GPU accelerated high-quality video\/image super-resolution. In: 2016 IEEE international symposium on broadband multimedia systems and broadcasting (BMSB). pp 1\u20134. [Online]. http:\/\/medialab.sjtu.edu.cn\/publications\/2016\/BMSB2016_ZhaoSongYangXie.pdf. Accessed 28 Jun 2019","DOI":"10.1109\/BMSB.2016.7521938"},{"key":"958_CR22","doi-asserted-by":"publisher","unstructured":"Sun Y et al (2019) MGPUSim: enabling multi-GPU performance modeling and optimization. In: Proceedings\u2014international symposium on computer architecture. pp 197\u2013209. https:\/\/doi.org\/10.1145\/3307650.3322230","DOI":"10.1145\/3307650.3322230"},{"issue":"2","key":"958_CR23","doi-asserted-by":"publisher","first-page":"503","DOI":"10.1145\/3296957.3173169","volume":"53","author":"R Ausavarungnirun","year":"2018","unstructured":"Ausavarungnirun R et al (2018) MASK: redesigning the GPU memory hierarchy to support multi-application concurrency. ACM SIGPLAN Not 53(2):503\u2013518. https:\/\/doi.org\/10.1145\/3296957.3173169","journal-title":"ACM SIGPLAN Not"},{"key":"958_CR24","doi-asserted-by":"crossref","unstructured":"Grauer-Gray S, Xu L, Searles R, Ayalasomayajula S, Cavazos J (2012) Auto-tuning a high-level language targeted to GPU codes. In: Innovative parallel computing (InPar). pp 1\u201310. [Online]. https:\/\/www.eecis.udel.edu\/~searles\/resources\/autotune-HMPP.pdf. Accessed 31 Jul 2017","DOI":"10.1109\/InPar.2012.6339595"},{"key":"958_CR25","doi-asserted-by":"publisher","unstructured":"Wen Y, O\u2019Boyle MF (2017) Merge or separate? Multi-job scheduling for OpenCL Kernels on CPU\/GPU platforms. In: Proceedings of the general purpose GPUs. pp 22\u201331. https:\/\/doi.org\/10.1145\/3038228.3038235","DOI":"10.1145\/3038228.3038235"},{"issue":"2","key":"958_CR26","doi-asserted-by":"publisher","first-page":"886","DOI":"10.1007\/s11227-013-0870-6","volume":"65","author":"HJ Choi","year":"2013","unstructured":"Choi HJ, Son DO, Kang SG, Kim JM, Lee H-H, Kim CH (2013) An efficient scheduling scheme using estimated execution time for heterogeneous computing systems. J Supercomput 65(2):886\u2013902. https:\/\/doi.org\/10.1007\/s11227-013-0870-6","journal-title":"J Supercomput"},{"key":"958_CR27","doi-asserted-by":"publisher","unstructured":"Wen Y, O\u2019Boyle MFP, Fensch C (2018) MaxPair: enhance OpenCL concurrent kernel execution by weighted maximum matching. In: Proceedings of the 11th workshop on general purpose GPUs. pp 40\u201349. https:\/\/doi.org\/10.1145\/3180270.3180272","DOI":"10.1145\/3180270.3180272"},{"issue":"4","key":"958_CR28","doi-asserted-by":"publisher","first-page":"407","DOI":"10.1145\/2499368.2451160","volume":"48","author":"S Pai","year":"2013","unstructured":"Pai S, Thazhuthaveetil MJ, Govindarajan R (2013) Improving GPGPU concurrency with elastic kernels. ACM SIGPLAN Not 48(4):407\u2013418. https:\/\/doi.org\/10.1145\/2499368.2451160","journal-title":"ACM SIGPLAN Not"},{"issue":"6","key":"958_CR29","doi-asserted-by":"publisher","first-page":"1522","DOI":"10.1109\/TPDS.2013.257","volume":"25","author":"J Zhong","year":"2014","unstructured":"Zhong J, He B (2014) Kernelet: high-throughput GPU kernel executions with dynamic slicing and scheduling. IEEE Trans Parallel Distrib Syst 25(6):1522\u20131532. https:\/\/doi.org\/10.1109\/TPDS.2013.257","journal-title":"IEEE Trans Parallel Distrib Syst"},{"key":"958_CR30","doi-asserted-by":"crossref","unstructured":"Wen Y, Wang Z, O\u2019boyle MFP (2014) Smart multi-task scheduling for OpenCL programs on CPU\/GPU heterogeneous platforms. In: 2014 21st international conference on high performance computing (HiPC). pp 1\u201310","DOI":"10.1109\/HiPC.2014.7116910"},{"key":"958_CR31","doi-asserted-by":"publisher","unstructured":"Margiolas C, O\u2019Boyle MFP (2016) Portable and transparent software managed scheduling on accelerators for fair resource sharing. In: Proceedings of the 2016 international symposium on code generation and optimization. pp 82\u201393. https:\/\/doi.org\/10.1145\/2854038.2854040","DOI":"10.1145\/2854038.2854040"},{"key":"958_CR32","doi-asserted-by":"publisher","unstructured":"Jiao Q, Lu M, Huynh HP, Mitra T (2015) Improving GPGPU energy-efficiency through concurrent kernel execution and DVFS. In: Proceedings of the 2015 IEEE\/ACM international symposium on code generation and optimization, CGO 2015. pp 1\u201311. https:\/\/doi.org\/10.1109\/CGO.2015.7054182","DOI":"10.1109\/CGO.2015.7054182"},{"key":"958_CR33","doi-asserted-by":"publisher","unstructured":"Belviranli ME, Khorasani F, Bhuyan LN, Gupta R (2016) CuMAS: data transfer aware multi-application scheduling for shared GPUs. In: Proceedings of the 2016 international conference on supercomputing, {ICS} 2016, Istanbul, Turkey, June 1\u20133, 2016. pp 31:1\u201331:12. https:\/\/doi.org\/10.1145\/2925426.2926271","DOI":"10.1145\/2925426.2926271"},{"key":"958_CR34","doi-asserted-by":"publisher","unstructured":"P\u00e9rez B, Bosque JL, Beivide R (2016) Simplifying programming and load balancing of data parallel applications on heterogeneous systems. In: Proceedings of the 9th annual workshop on general purpose processing using graphics processing unit\u2014GPGPU \u201916. pp 42\u201351. https:\/\/doi.org\/10.1145\/2884045.2884051","DOI":"10.1145\/2884045.2884051"},{"key":"958_CR35","doi-asserted-by":"crossref","unstructured":"Boyer M, Skadron K, Che S, Jayasena N (2013) Load balancing in a changing world: dealing with heterogeneity and performance variability. In: Proceedings of the ACM international conference on computing frontiers. p 21","DOI":"10.1145\/2482767.2482794"},{"key":"958_CR36","unstructured":"Kaleem R, Barik R, Shpeisman T, Hu C, Lewis BT, Pingali K (2017) Adaptive heterogeneous scheduling for integrated GPUs. In: Proceedings of the 23rd international conference on parallel architectures and compilation. pp 151\u2013162. [Online]. http:\/\/ai2-s2-pdfs.s3.amazonaws.com\/8db3\/c11cd85195f459b8ba82fe3326e8f86f1d52.pdf. Accessed 07 Jul 2017"},{"key":"958_CR37","unstructured":"Gregg C, Boyer M, Hazelwood K, Skadron K (2011) Dynamic heterogeneous scheduling decisions using historical runtime data. In: Proceedings of the 2nd workshop on applications for multi-and many-core processors. San Jose, CA. pp 1\u201312"},{"key":"958_CR38","doi-asserted-by":"crossref","unstructured":"Grewe MF, Dominik, O\u2019Boyle (2011) A static task partitioning approach for heterogeneous systems using OpenCL. In: International conference on compiler construction. pp 286\u2013305","DOI":"10.1007\/978-3-642-19861-8_16"},{"key":"958_CR39","doi-asserted-by":"publisher","unstructured":"Kofler K, Grasso I, Cosenza B, Fahringer T (2013) An automatic input-sensitive approach for heterogeneous task partitioning categories and subject descriptors. In: Proceedings of the 27th international ACM conference on international conference on supercomputing\u2014ICS \u201913. pp 149\u2013160. https:\/\/doi.org\/10.1145\/2464996.2465007","DOI":"10.1145\/2464996.2465007"},{"key":"958_CR40","unstructured":"Insieme Compiler Project. http:\/\/www.insieme-compiler.org\/. Accessed 02 May 2021"},{"key":"958_CR41","unstructured":"The LLVM Compiler Infrastructure Project. https:\/\/llvm.org\/. Accessed 02, May 2021"},{"issue":"8","key":"958_CR42","doi-asserted-by":"publisher","first-page":"2262","DOI":"10.1109\/CCGrid.2012.78","volume":"29","author":"VT Ravi","year":"2013","unstructured":"Ravi VT, Becchi M, Jiang W, Agrawal G, Chakradhar S (2013) Scheduling concurrent applications on a cluster of CPU\u2013GPU nodes. Futur Gener Comput Syst 29(8):2262\u20132271. https:\/\/doi.org\/10.1109\/CCGrid.2012.78","journal-title":"Futur Gener Comput Syst"},{"key":"958_CR43","doi-asserted-by":"publisher","unstructured":"Olson RS, Bartley N, Urbanowicz RJ, Moore JH (2016) Evaluation of a tree-based pipeline optimization tool for automating data science. In: Proceedings of the genetic and evolutionary computation conference 2016. pp 485\u2013492. https:\/\/doi.org\/10.1145\/2908812.2908918","DOI":"10.1145\/2908812.2908918"},{"key":"958_CR44","doi-asserted-by":"publisher","unstructured":"Laadan D, Vainshtein R, Curiel Y, Katz G, Rokach L (2020) MetaTPOT: enhancing a tree-based pipeline optimization tool using meta-learning. In: International conference on information and knowledge management, proceedings. pp 2097\u20132100. https:\/\/doi.org\/10.1145\/3340531.3412147","DOI":"10.1145\/3340531.3412147"},{"issue":"4","key":"958_CR45","doi-asserted-by":"publisher","first-page":"367","DOI":"10.1016\/S0167-9473(01)00065-2","volume":"38","author":"JH Friedman","year":"2002","unstructured":"Friedman JH (2002) Stochastic gradient boosting. Comput Stat Data Anal 38(4):367\u2013378. https:\/\/doi.org\/10.1016\/S0167-9473(01)00065-2","journal-title":"Comput Stat Data Anal"},{"issue":"6","key":"958_CR46","doi-asserted-by":"publisher","first-page":"971","DOI":"10.1007\/s10994-019-05787-1","volume":"108","author":"G Biau","year":"2019","unstructured":"Biau G, Cadre B, Rouvi\u00e8re L (2019) Accelerated gradient boosting. Mach Learn 108(6):971\u2013992. https:\/\/doi.org\/10.1007\/s10994-019-05787-1","journal-title":"Mach Learn"}],"container-title":["Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00607-021-00958-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00607-021-00958-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00607-021-00958-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,9,22]],"date-time":"2021-09-22T04:09:25Z","timestamp":1632283765000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00607-021-00958-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,3]]},"references-count":46,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2021,10]]}},"alternative-id":["958"],"URL":"https:\/\/doi.org\/10.1007\/s00607-021-00958-2","relation":{},"ISSN":["0010-485X","1436-5057"],"issn-type":[{"type":"print","value":"0010-485X"},{"type":"electronic","value":"1436-5057"}],"subject":[],"published":{"date-parts":[[2021,6,3]]},"assertion":[{"value":"12 August 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 May 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 June 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}