{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:44:28Z","timestamp":1740123868109,"version":"3.37.3"},"reference-count":66,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2016,12,28]],"date-time":"2016-12-28T00:00:00Z","timestamp":1482883200000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2018,4]]},"DOI":"10.1007\/s10766-016-0482-x","type":"journal-article","created":{"date-parts":[[2016,12,28]],"date-time":"2016-12-28T13:51:58Z","timestamp":1482933118000},"page":"336-375","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Accelerating Data Analytics on Integrated GPU Platforms via Runtime Specialization"],"prefix":"10.1007","volume":"46","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6592-5328","authenticated-orcid":false,"given":"Naila","family":"Farooqui","sequence":"first","affiliation":[]},{"given":"Indrajit","family":"Roy","sequence":"additional","affiliation":[]},{"given":"Yuan","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Vanish","family":"Talwar","sequence":"additional","affiliation":[]},{"given":"Rajkishore","family":"Barik","sequence":"additional","affiliation":[]},{"given":"Brian","family":"Lewis","sequence":"additional","affiliation":[]},{"given":"Tatiana","family":"Shpeisman","sequence":"additional","affiliation":[]},{"given":"Karsten","family":"Schwan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,12,28]]},"reference":[{"key":"482_CR1","unstructured":"Compute architecture of intel processor graphics. https:\/\/software.intel.com\/en-us\/file\/compute-architecture-of-intel-processor-graphics-gen8pdf"},{"key":"482_CR2","unstructured":"Intel thread building blocks. www.threadbuildingblocks.org"},{"key":"482_CR3","doi-asserted-by":"publisher","unstructured":"Agrawal, K., He, Y., Leiserson, C.E.: Adaptive work stealing with parallelism feedback. In: Proceedings of the 12th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP \u201907, pp. 112\u2013120. ACM, New York (2007). doi: 10.1145\/1229428.1229448","DOI":"10.1145\/1229428.1229448"},{"key":"482_CR4","unstructured":"AMD: AMD APP SDK. AMD, 2.9 edn"},{"key":"482_CR5","unstructured":"AMD: CodeXL. AMD, 3.1 edn"},{"key":"482_CR6","doi-asserted-by":"crossref","unstructured":"Ariel, A., Fung, W.W.L., Turner, A.E., Aamodt, T.M.: Visualizing complex dynamics in many-core accelerator architectures. In: IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), pp. 164\u2013174. IEEE Computer Society, White Plains, NY, USA (2010)","DOI":"10.1109\/ISPASS.2010.5452029"},{"issue":"2","key":"482_CR7","doi-asserted-by":"crossref","first-page":"187","DOI":"10.1002\/cpe.1631","volume":"23","author":"C Augonnet","year":"2011","unstructured":"Augonnet, C., Thibault, S., Namyst, R., Wacrenier, P.A.: Starpu: a unified platform for task scheduling on heterogeneous multicore architectures. Concurr Comput Pract Exp 23(2), 187\u2013198 (2011)","journal-title":"Concurr Comput Pract Exp"},{"key":"482_CR8","doi-asserted-by":"publisher","unstructured":"Baghsorkhi, S.S., Delahaye, M., Patel, S.J., Gropp, W.D., Hwu, W.M.W.: An adaptive performance modeling tool for gpu architectures. In: Proceedings of the 15th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP \u201910, pp. 105\u2013114. ACM, New York (2010). doi: 10.1145\/1693453.1693470","DOI":"10.1145\/1693453.1693470"},{"key":"482_CR9","doi-asserted-by":"crossref","unstructured":"Bakhoda, A., Yuan, G., Fung, W.W.L., Wong, H., Aamodt, T.M.: Analyzing cuda workloads using a detailed gpu simulator. In: IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), pp. 163\u2013174. Boston, MA, USA (2009)","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"482_CR10","doi-asserted-by":"crossref","unstructured":"Barik, R., Kaleem, R., Majeti, D., Lewis, B.T., Shpeisman, T., Hu, C., Ni, Y., Adl-Tabatabai, A.R.: Efficient mapping of irregular c++ applications to integrated gpus. In: Proceedings of Annual IEEE\/ACM International Symposium on Code Generation and Optimization, CGO \u201914, pp. 33:33\u201333:43. ACM, New York, NY, USA (2014)","DOI":"10.1145\/2581122.2544165"},{"key":"482_CR11","doi-asserted-by":"publisher","unstructured":"Becchi, M., Sajjapongse, K., Graves, I., Procter, A., Ravi, V., Chakradhar, S.: A virtual memory based runtime to support multi-tenancy in clusters with gpus. In: Proceedings of the 21st International Symposium on High-Performance Parallel and Distributed Computing, HPDC \u201912, pp. 97\u2013108. ACM, New York, NY, USA (2012). doi: 10.1145\/2287076.2287090","DOI":"10.1145\/2287076.2287090"},{"key":"482_CR12","doi-asserted-by":"publisher","unstructured":"Bender, M.A., Rabin, M.O.: Scheduling cilk multithreaded parallel programs on processors of different speeds. In: Proceedings of the Twelfth Annual ACM Symposium on Parallel Algorithms and Architectures, SPAA \u201900, pp. 13\u201321. ACM, New York (2000). doi: 10.1145\/341800.341803","DOI":"10.1145\/341800.341803"},{"issue":"5","key":"482_CR13","doi-asserted-by":"publisher","first-page":"720","DOI":"10.1145\/324133.324234","volume":"46","author":"RD Blumofe","year":"1999","unstructured":"Blumofe, R.D., Leiserson, C.E.: Scheduling multithreaded computations by work stealing. J. ACM 46(5), 720\u2013748 (1999). doi: 10.1145\/324133.324234","journal-title":"J. ACM"},{"key":"482_CR14","doi-asserted-by":"crossref","unstructured":"Boyer, M., Skadron, K., Che, S., Jayasena, N.: Load balancing in a changing world: dealing with heterogeneity and performance variability. In: Proceedings of the ACM International Conference on Computing Frontiers, CF \u201913, pp. 21:1\u201321:10. ACM, New York (2013)","DOI":"10.1145\/2482767.2482794"},{"key":"482_CR15","doi-asserted-by":"publisher","unstructured":"Burtscher, M., Nasre, R., Pingali, K.: A quantitative study of irregular programs on gpus. In: 2012 IEEE International Symposium on Workload Characterization (IISWC), pp. 141\u2013151 (2012). doi: 10.1109\/IISWC.2012.6402918","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"482_CR16","unstructured":"Cederman, D., Tsigas, P.: On dynamic load balancing on graphics processors. In: Proceedings of the 23rd ACM SIGGRAPH\/EUROGRAPHICS Symposium on Graphics Hardware. GH \u201908, pp. 57\u201364. Aire-la-Ville, Switzerland (2008)"},{"key":"482_CR17","doi-asserted-by":"publisher","unstructured":"Chase, D., Lev, Y.: Dynamic circular work-stealing deque. In: Proceedings of the Seventeenth Annual ACM Symposium on Parallelism in Algorithms and Architectures, SPAA \u201905, pp. 21\u201328. ACM, New York (2005). doi: 10.1145\/1073970.1073974","DOI":"10.1145\/1073970.1073974"},{"key":"482_CR18","doi-asserted-by":"publisher","unstructured":"Chatterjee, S., Grossman, M., Sb\u00eerlea, A.S., Sarkar, V.: Dynamic task parallelism with a GPU work-stealing runtime system. In: Rajopadhye, S.V., Strout, M.M. (eds.) Languages and Compilers for Parallel Computing, 24th International Workshop, LCPC 2011, Fort Collins, CO, USA, September 8\u201310, 2011. Revised Selected Papers, Lecture Notes in Computer Science, vol. 7146, pp. 203\u2013217. Springer (2011). doi: 10.1007\/978-3-642-36036-7_14","DOI":"10.1007\/978-3-642-36036-7_14"},{"key":"482_CR19","doi-asserted-by":"publisher","unstructured":"Che, S., Boyer, M., Meng, J., Tarjan, D., Sheaffer, J., Lee, S.H., Skadron, K.: Rodinia: A benchmark suite for heterogeneous computing. In: IEEE International Symposium on Workload Characterization, 2009. IISWC 2009, pp. 44\u201354 (2009). doi: 10.1109\/IISWC.2009.5306797","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"482_CR20","doi-asserted-by":"publisher","unstructured":"Chen, L., Villa, O., Krishnamoorthy, S., Gao, G.: Dynamic load balancing on single- and multi-GPU systems. In: IEEE International Symposium on Parallel Distributed Processing (IPDPS), pp. 1\u201312 (2010). doi: 10.1109\/IPDPS.2010.5470413","DOI":"10.1109\/IPDPS.2010.5470413"},{"key":"482_CR21","doi-asserted-by":"crossref","unstructured":"Collange, S., Defour, D., Parello, D.: Barra, a modular functional gpu simulator for gpgpu. Tech. Rep. hal-00359342 (2009)","DOI":"10.1109\/MASCOTS.2010.43"},{"key":"482_CR22","doi-asserted-by":"publisher","unstructured":"Farooqui, N., Kerr, A., Eisenhauer, G., Schwan, K., Yalamanchili, S.: Lynx: A dynamic instrumentation system for data-parallel applications on gpgpu architectures. In: 2012 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), pp. 58 \u201367 (2012). doi: 10.1109\/ISPASS.2012.6189206","DOI":"10.1109\/ISPASS.2012.6189206"},{"key":"482_CR23","unstructured":"Gautier, T., Ferreira\u00a0Lima, J.V., Maillard, N., Raffin, B.: Locality-aware work stealing on multi-CPU and multi-GPU architectures. In: 6th Workshop on Programmability Issues for Heterogeneous Multicores (MULTIPROG). Berlin, Germany (2013). https:\/\/hal.inria.fr\/hal-00780890"},{"key":"482_CR24","doi-asserted-by":"publisher","unstructured":"Goswami, N., Shankar, R., Joshi, M., Li, T.: Exploring gpgpu workloads: Characterization methodology, analysis and microarchitecture evaluation implications. In: 2010 IEEE International Symposium on Workload Characterization (IISWC), pp. 1\u201310 (2010). doi: 10.1109\/IISWC.2010.5649549","DOI":"10.1109\/IISWC.2010.5649549"},{"key":"482_CR25","doi-asserted-by":"publisher","unstructured":"Grewe, D., Wang, Z., O\u2019Boyle, M.: Portable mapping of data parallel programs to OpenCL for heterogeneous systems. In: IEEE\/ACM International Symposium on Code Generation and Optimization (CGO), pp. 1\u201310 (2013). doi: 10.1109\/CGO.2013.6494993","DOI":"10.1109\/CGO.2013.6494993"},{"key":"482_CR26","unstructured":"Grewe, D., Wang, Z., O\u2019Boyle, M.F.P.: Portable mapping of data parallel programs to opencl for heterogeneous systems. In: IEEE Computer Society CGO, pp. 22:1\u201322:10 (2013). http:\/\/dblp.uni-trier.de\/db\/conf\/cgo\/cgo2013.html#GreweWO13"},{"key":"482_CR27","unstructured":"Group, K.O.W.: The OpenCL Specification (2008). http:\/\/www.khronos.org\/registry\/cl\/specs\/opencl-1.0.29.pdf"},{"key":"482_CR28","doi-asserted-by":"publisher","unstructured":"Guo, Y., Zhao, J., Cave, V., Sarkar, V.: Slaw: A scalable locality-aware adaptive work-stealing scheduler for multi-core systems. In: Proceedings of the 15th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP \u201910, pp. 341\u2013342. ACM, New York (2010). doi: 10.1145\/1693453.1693504","DOI":"10.1145\/1693453.1693504"},{"key":"482_CR29","unstructured":"Gupta, V., Schwan, K., Tolia, N., Talwar, V., Ranganathan, P.: Pegasus: Coordinated scheduling for virtualized accelerator-based systems. In: Proceedings of the 2011 Usenix Annual Technical Conference, Portland, USA (2011)"},{"key":"482_CR30","doi-asserted-by":"crossref","unstructured":"Hong, S., Kim, S.K., Oguntebi, T., Olukotun, K.: Accelerating cuda graph algorithms at maximum warp. In: Proceedings of the 16th ACM Symposium on Principles and Practice of Parallel Programming. PPoPP \u201911, pp. 267\u2013276. ACM, New York (2011)","DOI":"10.1145\/1941553.1941590"},{"key":"482_CR31","unstructured":"IMPACT: The parboil benchmark suite (2007). http:\/\/www.crhc.uiuc.edu\/IMPACT\/parboil.php"},{"key":"482_CR32","doi-asserted-by":"publisher","unstructured":"Jim\u00e9nez, V.J., Vilanova, L., Gelado, I., Gil, M., Fursin, G., Navarro, N.: Predictive runtime code scheduling for heterogeneous architectures. In: Proceedings of the 4th International Conference on High Performance Embedded Architectures and Compilers, HiPEAC \u201909, pp. 19\u201333. Springer, Berlin, Heidelberg (2009). doi: 10.1007\/978-3-540-92990-1_4","DOI":"10.1007\/978-3-540-92990-1_4"},{"key":"482_CR33","doi-asserted-by":"publisher","unstructured":"Kaleem, R., Barik, R., Shpeisman, T., Lewis, B.T., Hu, C., Pingali, K.: Adaptive heterogeneous scheduling for integrated gpus. In: Proceedings of the 23rd International Conference on Parallel Architectures and Compilation, PACT \u201914, pp. 151\u2013162. ACM, New York (2014). doi: 10.1145\/2628071.2628088","DOI":"10.1145\/2628071.2628088"},{"key":"482_CR34","unstructured":"Kato, S., Lakshmanan, K., Rajkumar, R., Ishikawa, Y.: Timegraph: Gpu scheduling for real-time multi-tasking environments. In: Proceedings of the 2011 USENIX Conference on USENIX Annual Technical Conference. USENIXATC\u201911, pp. 2\u20132. USENIX Association, Berkeley, CA, USA (2011)"},{"key":"482_CR35","unstructured":"Kato, S., McThrow, M., Maltzahn, C., Brandt, S.: Gdev: First-class gpu resource management in the operating system. In: Proceedings of the 2012 USENIX Conference on Annual Technical Conference. USENIX ATC\u201912, pp. 37\u201337. USENIX Association, Berkeley, CA, USA (2012)"},{"key":"482_CR36","doi-asserted-by":"publisher","unstructured":"Kerr, A., Diamos, G., Yalamanchili, S.: A characterization and analysis of ptx kernels. In: IEEE International Symposium on Workload Characterization, 2009. IISWC 2009, pp. 3\u201312 (2009). doi: 10.1109\/IISWC.2009.5306801","DOI":"10.1109\/IISWC.2009.5306801"},{"key":"482_CR37","doi-asserted-by":"publisher","unstructured":"Kim, J., Kim, H., Lee, J.H., Lee, J.: Achieving a single compute device image in OpenCL for multiple GPUs. In: Proceedings of the 16th ACM symposium on Principles and practice of parallel programming, PPoPP \u201911, pp. 277\u2013288. ACM, NY, USA (2011). doi: 10.1145\/1941553.1941591","DOI":"10.1145\/1941553.1941591"},{"key":"482_CR38","doi-asserted-by":"crossref","unstructured":"Kim, S., Roy, I., Talwar, V.: Evaluating integrated graphics processors for data center workloads. In: Proceedings of the Workshop on Power-Aware Computing and Systems, HotPower \u201913, pp. 8:1\u20138:5. ACM, New York, NY, USA (2013)","DOI":"10.1145\/2525526.2525847"},{"key":"482_CR39","doi-asserted-by":"publisher","unstructured":"Kumar, V., Frampton, D., Blackburn, S.M., Grove, D., Tardieu, O.: Work-stealing without the baggage. In: Proceedings of the ACM International Conference on Object Oriented Programming Systems Languages and Applications, OOPSLA \u201912, pp. 297\u2013314. ACM, New York (2012). doi: 10.1145\/2384616.2384639","DOI":"10.1145\/2384616.2384639"},{"key":"482_CR40","doi-asserted-by":"publisher","unstructured":"L\u00ea, N.M., Pop, A., Cohen, A., Zappa\u00a0Nardelli, F.: Correct and efficient work-stealing for weak memory models. In: Proceedings of the 18th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP \u201913, pp. 69\u201380. ACM, New York (2013). doi: 10.1145\/2442516.2442524","DOI":"10.1145\/2442516.2442524"},{"key":"482_CR41","unstructured":"Lee, J., Samadi, M., Park, Y., Mahlke, S.: Transparent CPU\u2013GPU collaboration for data-parallel kernels on heterogeneous systems. In: Proceedings of the 22nd international conference on Parallel architectures and compilation techniques, PACT (2013)"},{"key":"482_CR42","doi-asserted-by":"crossref","unstructured":"Lee, K., Liu, L.: Efficient data partitioning model for heterogeneous graphs in the cloud. In: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, SC \u201913, pp. 46:1\u201346:12. ACM, New York (2013)","DOI":"10.1145\/2503210.2503302"},{"key":"482_CR43","doi-asserted-by":"publisher","unstructured":"Li, D., Becchi, M.: Deploying graph algorithms on gpus: an adaptive solution. In: 2013 IEEE 27th International Symposium on Parallel Distributed Processing (IPDPS), pp. 1013\u20131024 (2013). doi: 10.1109\/IPDPS.2013.101","DOI":"10.1109\/IPDPS.2013.101"},{"issue":"8","key":"482_CR44","doi-asserted-by":"crossref","first-page":"716","DOI":"10.14778\/2212351.2212354","volume":"5","author":"Y Low","year":"2012","unstructured":"Low, Y., Bickson, D., Gonzalez, J., Guestrin, C., Kyrola, A., Hellerstein, J.M.: Distributed graphlab: a framework for machine learning and data mining in the cloud. Proc. VLDB Endow. 5(8), 716\u2013727 (2012)","journal-title":"Proc. VLDB Endow."},{"key":"482_CR45","doi-asserted-by":"publisher","unstructured":"Luk, C.K., Hong, S., Kim, H.: Qilin: exploiting parallelism on heterogeneous multiprocessors with adaptive mapping. In: Proceedings of the 42nd Annual IEEE\/ACM International Symposium on Microarchitecture, MICRO 42, pp. 45\u201355. ACM, New York (2009). doi: 10.1145\/1669112.1669121","DOI":"10.1145\/1669112.1669121"},{"key":"482_CR46","doi-asserted-by":"crossref","unstructured":"Luo, L., Wong, M., Hwu, W.M.: An effective gpu implementation of breadth-first search. In: Proceedings of the 47th design automation conference, pp. 52\u201355. ACM (2010)","DOI":"10.1145\/1837274.1837289"},{"key":"482_CR47","doi-asserted-by":"publisher","unstructured":"Menychtas, K., Shen, K., Scott, M.L.: Disengaged scheduling for fair, protected access to fast computational accelerators. In: Proceedings of the 19th International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS \u201914, pp. 301\u2013316. ACM, New York (2014). doi: 10.1145\/2541940.2541963","DOI":"10.1145\/2541940.2541963"},{"key":"482_CR48","unstructured":"Min, S.J., Iancu, C., Yelick, K.: Hierarchical work stealing on manycore clusters. In: In Fifth Conference on Partitioned Global Address Space Programming Models (2011)"},{"key":"482_CR49","doi-asserted-by":"crossref","unstructured":"Nguyen, D., Lenharth, A., Pingali, K.: A lightweight infrastructure for graph analytics. Proceedings of the Twenty-Fourth ACM Symposium on Operating Systems Principles. SOSP \u201913, pp. 456\u2013471. ACM, New York (2013)","DOI":"10.1145\/2517349.2522739"},{"key":"482_CR50","unstructured":"Nilakant, K., Yoneki, E.: On the efficacy of apus for heterogeneous graph computation. In: Fourth Workshop on Systems for Future Multicore Architectures (2014)"},{"key":"482_CR51","unstructured":"NVIDIA: NVIDIA Compute Visual Profiler. NVIDIA Corporation, Santa Clara, California, 4.0 edn. (2011)"},{"key":"482_CR52","unstructured":"NVIDIA: NVIDIA CUDA Tools SDK CUPTI. NVIDIA Corporation, Santa Clara, California, 1.0 edn. (2011)"},{"key":"482_CR53","doi-asserted-by":"publisher","unstructured":"Pandit, P., Govindarajan, R.: Fluidic kernels: cooperative execution of opencl programs on multiple heterogeneous devices. In: Proceedings of Annual IEEE\/ACM International Symposium on Code Generation and Optimization, CGO \u201914, pp. 273:273\u2013273:283. ACM, New York (2014). doi: 10.1145\/2544137.2544163","DOI":"10.1145\/2544137.2544163"},{"key":"482_CR54","doi-asserted-by":"publisher","unstructured":"Phull, R., Li, C.H., Rao, K., Cadambi, H., Chakradhar, S.: Interference-driven resource management for gpu-based heterogeneous clusters. In: Proceedings of the 21st international symposium on High-Performance Parallel and Distributed Computing, HPDC \u201912, pp. 109\u2013120. ACM, New York (2012). doi: 10.1145\/2287076.2287091","DOI":"10.1145\/2287076.2287091"},{"key":"482_CR55","doi-asserted-by":"publisher","unstructured":"Ravi, V.T., Becchi, M., Agrawal, G., Chakradhar, S.: Supporting gpu sharing in cloud environments with a transparent runtime consolidation framework. In: Proceedings of the 20th international symposium on High performance distributed computing, HPDC \u201911, pp. 217\u2013228. ACM, New York (2011). doi: 10.1145\/1996130.1996160","DOI":"10.1145\/1996130.1996160"},{"key":"482_CR56","doi-asserted-by":"crossref","unstructured":"Ravi, V.T., Becchi, M., Jiang, W., Agrawal, G., Chakradhar, S.: Scheduling concurrent applications on a cluster of cpu\u2013gpu nodes. In: Proceedings of the 2012 12th IEEE\/ACM International Symposium on Cluster. Cloud and Grid Computing (ccgrid 2012), CCGRID \u201912, pp. 140\u2013147. IEEE Computer Society, Washington (2012)","DOI":"10.1109\/CCGrid.2012.78"},{"key":"482_CR57","doi-asserted-by":"publisher","unstructured":"Ravi, V.T., Ma, W., Chiu, D., Agrawal, G.: Compiler and runtime support for enabling generalized reduction computations on heterogeneous parallel configurations. In: Proceedings of the 24th ACM International Conference on Supercomputing, ICS \u201910, pp. 137\u2013146. ACM, New York (2010). doi: 10.1145\/1810085.1810106","DOI":"10.1145\/1810085.1810106"},{"key":"482_CR58","doi-asserted-by":"publisher","unstructured":"Ribic, H., Liu, Y.D.: Energy-efficient work-stealing language runtimes. In: Proceedings of the 19th International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS \u201914, pp. 513\u2013528. ACM, New York (2014). doi: 10.1145\/2541940.2541971","DOI":"10.1145\/2541940.2541971"},{"key":"482_CR59","doi-asserted-by":"crossref","unstructured":"Rossbach, C.J., Currey, J., Silberstein, M., Ray, B., Witchel, E.: Ptask: operating system abstractions to manage gpus as compute devices. In: Proceedings of the Twenty-Third ACM Symposium on Operating Systems Principles. SOSP \u201911, pp. 233\u2013248. ACM, New York (2011)","DOI":"10.1145\/2043556.2043579"},{"key":"482_CR60","doi-asserted-by":"crossref","unstructured":"Rossbach, C.J., Yu, Y., Currey, J., Martin, J.P., Fetterly, D.: Dandelion: a compiler and runtime for heterogeneous systems. In: Proceedings of the Twenty-Fourth ACM Symposium on Operating Systems Principles. SOSP \u201913, pp. 49\u201368. ACM, New York (2013)","DOI":"10.1145\/2517349.2522715"},{"key":"482_CR61","doi-asserted-by":"publisher","unstructured":"Sb\u00eerlea, A., Zou, Y., Budiml\u00edc, Z., Cong, J., Sarkar, V.: Mapping a data-flow programming model onto heterogeneous platforms. In: Proceedings of the 13th ACM SIGPLAN\/SIGBED International Conference on Languages, Compilers, Tools and Theory for Embedded Systems, LCTES \u201912, pp. 61\u201370. ACM, New York (2012). doi: 10.1145\/2248418.2248428","DOI":"10.1145\/2248418.2248428"},{"key":"482_CR62","doi-asserted-by":"publisher","unstructured":"Schaa, D., Kaeli, D.: Exploring the multiple-GPU design space. In: IEEE International Symposium on Parallel Distributed Processing. IPDPS., pp. 1\u201312 (2009). doi: 10.1109\/IPDPS.2009.5161068","DOI":"10.1109\/IPDPS.2009.5161068"},{"key":"482_CR63","doi-asserted-by":"publisher","unstructured":"Scogland, T., Rountree, B., chun Feng, W., De\u00a0Supinski, B.: Heterogeneous task scheduling for accelerated OpenMP. In: IEEE 26th International Parallel Distributed Processing Symposium (IPDPS), pp. 144\u2013155 (2012). doi: 10.1109\/IPDPS.2012.23","DOI":"10.1109\/IPDPS.2012.23"},{"key":"482_CR64","doi-asserted-by":"publisher","unstructured":"Wang, L., Cui, H., Duan, Y., Lu, F., Feng, X., Yew, P.C.: An adaptive task creation strategy for work-stealing scheduling. In: Proceedings of the 8th Annual IEEE\/ACM International Symposium on Code Generation and Optimization, CGO \u201910, pp. 266\u2013277. ACM, New York (2010). doi: 10.1145\/1772954.1772992","DOI":"10.1145\/1772954.1772992"},{"key":"482_CR65","doi-asserted-by":"crossref","unstructured":"Wu, H., Diamos, G., Sheard, T., Aref, M., Baxter, S., Garland, M., Yalamanchili, S.: Red fox: an execution environment for relational query processing on gpus. In: Proceedings of Annual IEEE\/ACM International Symposium on Code Generation and Optimization, CGO \u201914, pp. 44:44\u201344:54. ACM, New York (2014)","DOI":"10.1145\/2581122.2544166"},{"key":"482_CR66","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Owens, J.D.: A quantitative performance analysis model for gpu architectures. In: 17th International Conference on High-Performance Computer Architecture (HPCA-17), pp. 382\u2013393. IEEE Computer Society, San Antonio, TX, USA (2011)","DOI":"10.1109\/HPCA.2011.5749745"}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10766-016-0482-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-016-0482-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-016-0482-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,9,16]],"date-time":"2019-09-16T21:26:37Z","timestamp":1568669197000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10766-016-0482-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,12,28]]},"references-count":66,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2018,4]]}},"alternative-id":["482"],"URL":"https:\/\/doi.org\/10.1007\/s10766-016-0482-x","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"type":"print","value":"0885-7458"},{"type":"electronic","value":"1573-7640"}],"subject":[],"published":{"date-parts":[[2016,12,28]]}}}