{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T00:18:53Z","timestamp":1773274733717,"version":"3.50.1"},"reference-count":209,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2020,7,31]],"date-time":"2020-07-31T00:00:00Z","timestamp":1596153600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,7,31]],"date-time":"2020-07-31T00:00:00Z","timestamp":1596153600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. HPC"],"published-print":{"date-parts":[[2020,12]]},"DOI":"10.1007\/s42514-020-00039-4","type":"journal-article","created":{"date-parts":[[2020,7,31]],"date-time":"2020-07-31T11:03:50Z","timestamp":1596193430000},"page":"382-400","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":54,"title":["Parallel programming models for heterogeneous many-cores: a comprehensive survey"],"prefix":"10.1007","volume":"2","author":[{"given":"Jianbin","family":"Fang","sequence":"first","affiliation":[]},{"given":"Chun","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,7,31]]},"reference":[{"key":"39_CR2","unstructured":"Abadi, M., et al.: Tensorflow: Large-scale machine learning on heterogeneous distributed systems. CoRR (2016)"},{"key":"39_CR3","unstructured":"Alfieri, R.A.: An efficient kernel-based implementation of POSIX threads. In: USENIX Summer 1994 Technical Conference. USENIX Association (1994)"},{"key":"39_CR4","unstructured":"Amd brook+ programming.: Tech. rep., AMD Corporation (2007)"},{"key":"39_CR5","unstructured":"Amd cal programming guide v2.0.: Tech. rep., AMD Corporation (2010)"},{"key":"39_CR6","unstructured":"AMD\u2019s OpenCL Implementation.: https:\/\/github.com\/RadeonOpenCompute\/ROCm-OpenCL-Runtime (2020)"},{"key":"39_CR7","unstructured":"Amini, M., et\u00a0al.: Static compilation analysis for host-accelerator communication optimization. In: Languages and Compilers for Parallel Computing, 24th International Workshop, LCPC (2011)"},{"key":"39_CR8","doi-asserted-by":"crossref","unstructured":"Andrade, G., et\u00a0al.: Parallelme: A parallel mobile engine to explore heterogeneity in mobile computing architectures. In: Euro-Par 2016: Parallel Processing\u201422nd International Conference on Parallel and Distributed Computing (2016)","DOI":"10.1007\/978-3-319-43659-3_33"},{"key":"39_CR9","unstructured":"Arevalo, A., et al.: Programming the cell broadband engine: examples and best practices (2007)"},{"key":"39_CR10","doi-asserted-by":"crossref","unstructured":"Ayguad\u00e9, E., et al.: An extension of the starss programming model for platforms with multiple GPUs. In: Euro-Par 2009 Parallel Processing (2009)","DOI":"10.1007\/978-3-642-03869-3_79"},{"key":"39_CR11","unstructured":"Bader, D.A., Agarwal, V.: FFTC: fastest Fourier transform for the IBM cell broadband engine. In: High Performance Computing, HiPC (2007)"},{"key":"39_CR12","doi-asserted-by":"publisher","first-page":"753","DOI":"10.1007\/s10766-012-0211-z","volume":"41","author":"H Bae","year":"2013","unstructured":"Bae, H., et al.: The cetus source-to-source compiler infrastructure: overview and evaluation. Int. J. Parallel Program. 41, 753\u2013767 (2013)","journal-title":"Int. J. Parallel Program."},{"key":"39_CR13","doi-asserted-by":"crossref","unstructured":"Balaprakash, P., et al.: Autotuning in high-performance computing applications. In: Proceedings of the IEEE (2018)","DOI":"10.1109\/JPROC.2018.2841200"},{"key":"39_CR14","doi-asserted-by":"crossref","unstructured":"Barker, K.J., et\u00a0al.: Entering the petaflop era: the architecture and performance of roadrunner. In: Proceedings of the ACM\/IEEE Conference on High Performance Computing, SC (2008)","DOI":"10.1109\/SC.2008.5217926"},{"key":"39_CR15","doi-asserted-by":"crossref","unstructured":"Baskaran, M.M., et\u00a0al.: Automatic c-to-cuda code generation for affine programs. In: R.\u00a0Gupta (ed.) 19th International Conference on Compiler Construction (CC) (2010)","DOI":"10.1007\/978-3-642-11970-5_14"},{"key":"39_CR16","doi-asserted-by":"crossref","unstructured":"Beckingsale, D., et\u00a0al.: Performance portable C++ programming with RAJA. In: Proceedings of the 24th ACM SIGPLAN Symposium on Principles and Practice of Parallel ProgrammiDng, PPoPP (2019)","DOI":"10.1145\/3293883.3302577"},{"key":"39_CR17","unstructured":"Beignet OpenCL.: https:\/\/www.freedesktop.org\/wiki\/ Software\/Beignet\/ (2020)"},{"key":"39_CR18","doi-asserted-by":"publisher","first-page":"359","DOI":"10.1016\/B978-0-12-385963-1.00026-5","volume-title":"GPU Computing Gems Jade Edition, Applications of GPU Computing Series","author":"N Bell","year":"2012","unstructured":"Bell, N., Hoberock, J.: Chapter 26-thrust: a productivity-oriented library for cuda. In: Mei, W., Hwu, W. (eds.) GPU Computing Gems Jade Edition, Applications of GPU Computing Series, pp. 359\u2013371. Morgan Kaufmann, Burlington (2012)"},{"key":"39_CR19","doi-asserted-by":"crossref","unstructured":"Bellens, P., et\u00a0al.: Cellss: a programming model for the cell BE architecture. In: Proceedings of the ACM\/IEEE SC2006 Conference on High Performance Networking and Computing (2006)","DOI":"10.1109\/SC.2006.17"},{"key":"39_CR20","unstructured":"Bodin, F., Romain, D., Colin De\u00a0Verdiere, G.: One OpenCL to Rule Them All? In: International Workshop on Multi-\/Many-core Computing Systems, MuCoCoS (2013)"},{"key":"39_CR21","doi-asserted-by":"crossref","unstructured":"Boyer, M., et\u00a0al.: Improving GPU performance prediction with data transfer modeling. In: 2013 IEEE International Symposium on Parallel & Distributed Processing, Workshops and Phd Forum (2013)","DOI":"10.1109\/IPDPSW.2013.236"},{"key":"39_CR22","doi-asserted-by":"crossref","unstructured":"Breitbart, J., Fohry, C.: Opencl: an effective programming model for data parallel computations at the cell broadband engine. In: 24th IEEE International Symposium on Parallel and Distributed Processing, IPDPS (2010)","DOI":"10.1109\/IPDPSW.2010.5470823"},{"key":"39_CR23","first-page":"1","volume":"18","author":"AR Brodtkorb","year":"2010","unstructured":"Brodtkorb, A.R., et al.: State-of-the-art in heterogeneous computing. Sci. Program. 18, 1\u201333 (2010)","journal-title":"Sci. Program."},{"key":"39_CR24","doi-asserted-by":"publisher","first-page":"777","DOI":"10.1145\/1015706.1015800","volume":"23","author":"I Buck","year":"2004","unstructured":"Buck, I., et al.: Brook for GPUs: stream computing on graphics hardware. ACM Trans. Graph 23, 777\u2013786 (2004)","journal-title":"ACM Trans. Graph"},{"key":"39_CR25","doi-asserted-by":"crossref","unstructured":"Chandrasekhar, A., et\u00a0al.: IGC: the open source intel graphics compiler. In: IEEE\/ACM International Symposium on Code Generation and Optimization, CGO (2019)","DOI":"10.1109\/CGO.2019.8661189"},{"key":"39_CR26","doi-asserted-by":"crossref","unstructured":"Che, S., et\u00a0al.: Rodinia: A benchmark suite for heterogeneous computing. In: Proceedings of the 2009 IEEE International Symposium on Workload Characterization. IEEE Computer Society (2009)","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"39_CR27","unstructured":"Chen, T., et\u00a0al.: Mxnet: a flexible and efficient machine learning library for heterogeneous distributed systems. CoRR abs\/1512.01274 (2015)"},{"key":"39_CR28","doi-asserted-by":"publisher","first-page":"559","DOI":"10.1147\/rd.515.0559","volume":"51","author":"T Chen","year":"2007","unstructured":"Chen, T., et al.: Cell broadband engine architecture and its first implementation\u2014a performance view. IBM J. Res. Dev. 51, 559\u2013572 (2007)","journal-title":"IBM J. Res. Dev."},{"key":"39_CR29","doi-asserted-by":"publisher","first-page":"80","DOI":"10.1007\/s10766-019-00646-x","volume":"48","author":"D Chen","year":"2020","unstructured":"Chen, D., et al.: Characterizing scalability of sparse matrix-vector multiplications on phytium ft-2000+. Int. J. Parallel Program. 48, 80\u201397 (2020)","journal-title":"Int. J. Parallel Program."},{"key":"39_CR30","unstructured":"Ciechanowicz, P., et\u00a0al.: The m\u00fcnster skeleton library muesli: a comprehensive overview. Working Papers, ERCIS-European Research Center for Information Systems, No. 7 (2009)"},{"key":"39_CR31","unstructured":"Cole, M.I.: Algorithmic skeletons: structured management of parallel computation (1989)"},{"key":"39_CR32","unstructured":"Common-Shader Core.: https:\/\/docs.microsoft.com\/en-us\/windows\/win32\/direct3dhlsl\/dx-graphics-hlsl-common-core?redirectedfrom=MSDN (2018)"},{"key":"39_CR33","doi-asserted-by":"crossref","unstructured":"Copik, M., Kaiser, H.: Using SYCL as an implementation framework for hpx.compute. In: Proceedings of the 5th International Workshop on OpenCL, IWOCL (2017)","DOI":"10.1145\/3078155.3078187"},{"key":"39_CR34","doi-asserted-by":"crossref","unstructured":"Crawford, C.H., et\u00a0al.: Accelerating computing with the cell broadband engine processor. In: Proceedings of the 5th Conference on Computing Frontiers (2008)","DOI":"10.1145\/1366230.1366234"},{"key":"39_CR35","doi-asserted-by":"crossref","unstructured":"Cummins, C., et\u00a0al.: End-to-end deep learning of optimization heuristics. In: PACT (2017)","DOI":"10.1109\/PACT.2017.24"},{"key":"39_CR36","doi-asserted-by":"crossref","unstructured":"Cummins, C., et\u00a0al.: Synthesizing benchmarks for predictive modeling. In: CGO (2017)","DOI":"10.1109\/CGO.2017.7863731"},{"key":"39_CR37","doi-asserted-by":"publisher","first-page":"283","DOI":"10.1109\/TPDS.2017.2755657","volume":"29","author":"TT Dao","year":"2018","unstructured":"Dao, T.T., Lee, J.: An auto-tuner for opencl work-group size on GPUs. IEEE Trans. Parallel Distrib. Syst. 29, 283\u2013296 (2018)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"39_CR38","doi-asserted-by":"crossref","unstructured":"Dastgeer, U., et\u00a0al.: Adaptive implementation selection in the skepu skeleton programming library. In: Advanced Parallel Processing Technologies\u201410th International Symposium, APPT (2013)","DOI":"10.1007\/978-3-642-45293-2_13"},{"key":"39_CR39","doi-asserted-by":"publisher","first-page":"1023","DOI":"10.1007\/s11227-012-0789-3","volume":"62","author":"NE Davis","year":"2012","unstructured":"Davis, N.E., et al.: Paradigmatic shifts for exascale supercomputing. J. Supercomput. 62, 1023\u20131044 (2012)","journal-title":"J. Supercomput."},{"key":"39_CR40","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3132710","volume":"14","author":"D De Sensi","year":"2017","unstructured":"De Sensi, D., et al.: Bringing parallel patterns out of the corner: the p3 arsec benchmark suite. ACM Trans. Archit. Code Optim. (TACO) 14, 1\u201326 (2017)","journal-title":"ACM Trans. Archit. Code Optim. (TACO)"},{"key":"39_CR41","doi-asserted-by":"crossref","unstructured":"de\u00a0Carvalho Moreira, W., et\u00a0al.: Exploring heterogeneous mobile architectures with a high-level programming model. In: 29th International Symposium on Computer Architecture and High Performance Computing, SBAC-PAD (2017)","DOI":"10.1109\/SBAC-PAD.2017.11"},{"key":"39_CR42","unstructured":"de\u00a0Fine\u00a0Licht, J., Hoefler, T.: hlslib: Software engineering for hardware design. CoRR (2019)"},{"key":"39_CR43","doi-asserted-by":"publisher","unstructured":"Demidov, D., et\u00a0al.: ddemidov\/amgcl: 1.2.0 (2018). https:\/\/doi.org\/10.5281\/zenodo.1244532","DOI":"10.5281\/zenodo.1244532"},{"key":"39_CR44","doi-asserted-by":"publisher","unstructured":"Demidov, D., et\u00a0al.: ddemidov\/vexcl: 1.4.1 (2017). https:\/\/doi.org\/10.5281\/zenodo.571466","DOI":"10.5281\/zenodo.571466"},{"key":"39_CR45","doi-asserted-by":"publisher","first-page":"535","DOI":"10.1134\/S1995080219050056","volume":"40","author":"D Demidov","year":"2019","unstructured":"Demidov, D.: Amgcl: an efficient, flexible, and extensible algebraic multigrid implementation. Lobachevskii J. Math. 40, 535\u2013546 (2019)","journal-title":"Lobachevskii J. Math."},{"key":"39_CR46","unstructured":"Diamos, C., et\u00a0al.: Compiling a high-level language for GPUs: (via language support for architectures and compilers). In: ACM SIGPLAN Conference on Programming Language Design and Implementation, PLDI (2012)"},{"key":"39_CR47","doi-asserted-by":"crossref","unstructured":"Diamos, G.F., et\u00a0al.: Ocelot: a dynamic optimization framework for bulk-synchronous applications in heterogeneous systems. In: 19th International Conference on Parallel Architectures and Compilation Techniques, PACT (2010)","DOI":"10.1145\/1854273.1854318"},{"key":"39_CR48","unstructured":"Directcompute programming guide.: Tech. rep., NVIDIA Corporation (2010)"},{"key":"39_CR49","doi-asserted-by":"publisher","first-page":"173","DOI":"10.1142\/S0129626411000151","volume":"21","author":"A Duran","year":"2011","unstructured":"Duran, A., et al.: Ompss: a proposal for programming heterogeneous multi-core architectures. Parallel Process. Lett. 21, 173\u2013193 (2011)","journal-title":"Parallel Process. Lett."},{"key":"39_CR50","doi-asserted-by":"publisher","first-page":"3202","DOI":"10.1016\/j.jpdc.2014.07.003","volume":"74","author":"HC Edwards","year":"2014","unstructured":"Edwards, H.C., et al.: Kokkos: Enabling manycore performance portability through polymorphic memory access patterns. J. Parallel Distrib. Comput. 74, 3202\u20133216 (2014)","journal-title":"J. Parallel Distrib. Comput."},{"key":"39_CR51","doi-asserted-by":"crossref","unstructured":"Emani, M.K., et\u00a0al.: Smart, adaptive mapping of parallelism in the presence of external workload. In: CGO (2013)","DOI":"10.1109\/CGO.2013.6495010"},{"key":"39_CR52","doi-asserted-by":"publisher","first-page":"62","DOI":"10.1007\/s10766-017-0490-5","volume":"46","author":"A Ernstsson","year":"2018","unstructured":"Ernstsson, A., et al.: Skepu 2: flexible and type-safe skeleton programming for heterogeneous parallel systems. Int. J. Parallel Program. 46, 62\u201380 (2018)","journal-title":"Int. J. Parallel Program."},{"key":"39_CR53","doi-asserted-by":"crossref","unstructured":"Fang, J., et\u00a0al.: A comprehensive performance comparison of CUDA and opencl. In: ICPP (2011)","DOI":"10.1109\/ICPP.2011.45"},{"key":"39_CR54","doi-asserted-by":"crossref","unstructured":"Fang, J., et\u00a0al.: Implementing and evaluating opencl on an armv8 multi-core CPU. In: 2017 IEEE International Symposium on Parallel and Distributed Processing with Applications and 2017 IEEE International Conference on Ubiquitous Computing and Communications (ISPA\/IUCC) (2017)","DOI":"10.1109\/ISPA\/IUCC.2017.00131"},{"key":"39_CR55","doi-asserted-by":"crossref","unstructured":"Fang, J., et\u00a0al.: Test-driving intel xeon phi. In: ACM\/SPEC International Conference on Performance Engineering (ICPE), pp. 137\u2013148 (2014)","DOI":"10.1145\/2568088.2576799"},{"key":"39_CR56","unstructured":"Fang, J.: Towards a systematic exploration of the optimization space for many-core processors. Ph.D. thesis, Delft University of Technology, Netherlands (2014)"},{"issue":"4","key":"39_CR57","doi-asserted-by":"publisher","first-page":"1640002","DOI":"10.1142\/S0129626416400028","volume":"26","author":"J Fang","year":"2016","unstructured":"Fang, J., et al.: Evaluating multiple streams on heterogeneous platforms. Parallel Process. Lett. 26(4), 1640002 (2016)","journal-title":"Parallel Process. Lett."},{"key":"39_CR58","unstructured":"FreeOCL.: http:\/\/www.zuzuf.net\/FreeOCL\/ (2020)"},{"key":"39_CR59","unstructured":"GalliumCompute.: https:\/\/dri.freedesktop.org\/wiki \/GalliumCompute\/ (2020)"},{"key":"39_CR60","unstructured":"GalliumCompute.: https:\/\/github.com\/intel\/compute-runtime (2020)"},{"key":"39_CR61","doi-asserted-by":"publisher","first-page":"769","DOI":"10.1016\/j.parco.2013.09.003","volume":"39","author":"MK Gardner","year":"2013","unstructured":"Gardner, M.K., et al.: Characterizing the challenges and evaluating the efficacy of a cuda-to-opencl translator. Parallel Comput. 39, 769\u2013786 (2013)","journal-title":"Parallel Comput."},{"key":"39_CR62","doi-asserted-by":"crossref","unstructured":"Giles, M.B., et\u00a0al.: Performance analysis of the OP2 framework on many-core architectures. SIGMETRICS Performance Evaluation Review (2011)","DOI":"10.1145\/1964218.1964221"},{"key":"39_CR63","doi-asserted-by":"publisher","first-page":"1117","DOI":"10.1016\/j.jpdc.2011.07.011","volume":"72","author":"J G\u00f3mez-Luna","year":"2012","unstructured":"G\u00f3mez-Luna, J., et al.: Performance models for asynchronous data transfers on consumer graphics processing units. J. Parallel Distrib. Comput. 72, 1117\u20131126 (2012)","journal-title":"J. Parallel Distrib. Comput."},{"key":"39_CR64","doi-asserted-by":"crossref","unstructured":"Govindaraju, N.K., et\u00a0al.: High performance discrete fourier transforms on graphics processors. In: Proceedings of the ACM\/IEEE Conference on High Performance Computing, SC (2008)","DOI":"10.1109\/SC.2008.5213922"},{"key":"39_CR65","doi-asserted-by":"crossref","unstructured":"Grasso, I., et\u00a0al.: Energy efficient HPC on embedded socs: optimization techniques for mali GPU. In: 2014 IEEE 28th International Parallel and Distributed Processing Symposium, IPDPS (2014)","DOI":"10.1109\/IPDPS.2014.24"},{"key":"39_CR66","unstructured":"Green500 Supercomputers.: https:\/\/www.top500.org\/green500\/ (2020)"},{"key":"39_CR67","doi-asserted-by":"crossref","unstructured":"Gregg, C., et\u00a0al.: Where is the data? why you cannot debate CPU vs. GPU performance without the answer. In: IEEE International Symposium on Performance Analysis of Systems and Software, ISPASS (2011)","DOI":"10.1109\/ISPASS.2011.5762730"},{"key":"39_CR68","unstructured":"Gregory, K., Miller, A.: C++ AMP: accelerated massive parallelism with microsoft visual C++ (2012)"},{"key":"39_CR69","doi-asserted-by":"crossref","unstructured":"Grewe, D., et\u00a0al.: Opencl task partitioning in the presence of GPU contention. In: LCPC (2013a)","DOI":"10.1007\/978-3-319-09967-5_5"},{"key":"39_CR70","doi-asserted-by":"crossref","unstructured":"Grewe, D., et\u00a0al.: Portable mapping of data parallel programs to opencl for heterogeneous systems. In: CGO (2013b)","DOI":"10.1109\/CGO.2013.6494993"},{"key":"39_CR71","doi-asserted-by":"publisher","first-page":"10","DOI":"10.1109\/MM.2006.41","volume":"26","author":"M Gschwind","year":"2006","unstructured":"Gschwind, M., et al.: Synergistic processing in cell\u2019s multicore architecture. IEEE Micro 26, 10\u201324 (2006)","journal-title":"IEEE Micro"},{"key":"39_CR72","doi-asserted-by":"crossref","unstructured":"Haidl, M., et\u00a0al.: Pacxxv2 + RV: an llvm-based portable high-performance programming model. In: Proceedings of the Fourth Workshop on the LLVM Compiler Infrastructure in HPC, LLVM-HPC@SC (2017)","DOI":"10.1145\/3148173.3148185"},{"key":"39_CR73","doi-asserted-by":"crossref","unstructured":"Haidl, M., Gorlatch, S.: PACXX: towards a unified programming model for programming accelerators using C++14. In: Proceedings of the 2014 LLVM Compiler Infrastructure in HPC, LLVM (2014)","DOI":"10.1109\/LLVM-HPC.2014.9"},{"key":"39_CR74","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1007\/s10766-017-0497-y","volume":"46","author":"M Haidl","year":"2018","unstructured":"Haidl, M., Gorlatch, S.: High-level programming for many-cores using C++14 and the STL. Int. J. Parallel Program. 46, 23\u201341 (2018)","journal-title":"Int. J. Parallel Program."},{"key":"39_CR75","doi-asserted-by":"crossref","unstructured":"Han, T.D., et\u00a0al.: hicuda: a high-level directive-based language for GPU programming. In: Proceedings of 2nd Workshop on General Purpose Processing on Graphics Processing Units, GPGPU, ACM International Conference Proceeding Series (2009)","DOI":"10.1145\/1513895.1513902"},{"key":"39_CR76","doi-asserted-by":"publisher","first-page":"78","DOI":"10.1109\/TPDS.2010.62","volume":"22","author":"TD Han","year":"2011","unstructured":"Han, T.D., et al.: hicuda: high-level GPGPU programming. IEEE Trans. Parallel Distrib. Syst. 22, 78\u201390 (2011)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"39_CR77","unstructured":"Harris, M.J., et\u00a0al.: Simulation of cloud dynamics on graphics hardware. In: Proceedings of the 2003 ACM SIGGRAPH\/EUROGRAPHICS Workshop on Graphics Hardware (2003)"},{"key":"39_CR78","doi-asserted-by":"publisher","first-page":"1093","DOI":"10.1016\/j.cpc.2010.12.052","volume":"182","author":"MJ Harvey","year":"2011","unstructured":"Harvey, M.J., et al.: Swan: a tool for porting CUDA programs to opencl. Comput. Phys. Commun. 182, 1093\u20131099 (2011)","journal-title":"Comput. Phys. Commun."},{"key":"39_CR79","unstructured":"HCC.: Heterogeneous Compute Compiler. https:\/\/gpuopen.com\/compute-product\/hcc-heterogeneous-compute-compiler\/ (2020)"},{"key":"39_CR80","doi-asserted-by":"crossref","unstructured":"He, J., et\u00a0al.: Openmdsp: Extending openmp to program multi-core DSP. In: 2011 International Conference on Parallel Architectures and Compilation Techniques, PACT (2011)","DOI":"10.1109\/PACT.2011.60"},{"key":"39_CR81","unstructured":"Heler, T., et\u00a0al.: Hpx\u2014an open source c++ standard library for parallelism and concurrency. In: OpenSuCo (2017)"},{"key":"39_CR82","doi-asserted-by":"crossref","unstructured":"Heller, T., et\u00a0al.: Closing the performance gap with modern C++. In: High Performance Computing - ISC High Performance 2016 International Workshops, ExaComm, E-MuCoCoS, HPC-IODC, IXPUG, IWOPH, P$$\\wedge$$3MA, VHPC, WOPSSS (2016)","DOI":"10.1007\/978-3-319-46079-6_2"},{"key":"39_CR83","doi-asserted-by":"crossref","unstructured":"Heller, T., et\u00a0al.: Using HPX and libgeodecomp for scaling HPC applications on heterogeneous supercomputers. In: Proceedings of the Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems, ScalA (2013)","DOI":"10.1145\/2530268.2530269"},{"key":"39_CR84","unstructured":"HIP.: Heterogeneous-Compute Interface for Portability. https:\/\/github.com\/RadeonOpenCompute\/hcc (2020)"},{"key":"39_CR85","unstructured":"High-level abstractions for performance.: Portability and continuity of scientific software on future computing systems. University of Oxford, Tech. rep. (2014)"},{"key":"39_CR86","unstructured":"HLSL.: The High Level Shading Language for DirectX. https:\/\/docs.microsoft.com\/en-us\/windows\/win32\/direct3dhlsl\/dx-graphics-hlsl (2018)"},{"key":"39_CR87","doi-asserted-by":"crossref","unstructured":"Hong, S., et\u00a0al.: Accelerating CUDA graph algorithms at maximum warp. In: Proceedings of the 16th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPOPP (2011)","DOI":"10.1145\/1941553.1941590"},{"key":"39_CR88","doi-asserted-by":"crossref","unstructured":"Hong, S., et\u00a0al.: Green-marl: a DSL for easy and efficient graph analysis. In: Proceedings of the 17th International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS (2012)","DOI":"10.1145\/2150976.2151013"},{"key":"39_CR89","unstructured":"Intel Inc.: hStreams Architecture for MPSS 3.5 (2015)"},{"key":"39_CR90","unstructured":"Intel Manycore Platform Software Stack.: https:\/\/software.intel.com\/en-us\/articles\/intel-manycore-platform-software-stack-mpss (2020)"},{"key":"39_CR91","unstructured":"Intel\u2019s OneAPI.: https:\/\/software.intel.com\/en-us\/oneapi (2020)"},{"key":"39_CR92","unstructured":"Introducing rdna architecture.: Tech. rep., AMD Corporation (2019)"},{"key":"39_CR93","doi-asserted-by":"publisher","first-page":"752","DOI":"10.1007\/s10766-014-0320-y","volume":"43","author":"P J\u00e4\u00e4skel\u00e4inen","year":"2015","unstructured":"J\u00e4\u00e4skel\u00e4inen, P., et al.: pocl: a performance-portable opencl implementation. Int. J. Parallel Programm. 43, 752\u2013785 (2015)","journal-title":"Int. J. Parallel Programm."},{"key":"39_CR94","doi-asserted-by":"publisher","first-page":"589","DOI":"10.1147\/rd.494.0589","volume":"49","author":"JA Kahle","year":"2005","unstructured":"Kahle, J.A., et al.: Introduction to the cell multiprocessor. IBM J. Res. Dev. 49, 589\u2013604 (2005)","journal-title":"IBM J. Res. Dev."},{"key":"39_CR95","doi-asserted-by":"publisher","first-page":"563","DOI":"10.1145\/321406.321418","volume":"14","author":"RM Karp","year":"1967","unstructured":"Karp, R.M., et al.: The organization of computations for uniform recurrence equations. J. ACM (JACM) 14, 563\u2013590 (1967)","journal-title":"J. ACM (JACM)"},{"key":"39_CR96","doi-asserted-by":"crossref","unstructured":"Kim, J., et\u00a0al.: Bridging opencl and CUDA: a comparative analysis and translation. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC (2015)","DOI":"10.1145\/2807591.2807621"},{"key":"39_CR97","doi-asserted-by":"crossref","unstructured":"Kim, J., et\u00a0al.: Snucl: an opencl framework for heterogeneous CPU\/GPU clusters. In: International Conference on Supercomputing, ICS (2012)","DOI":"10.1145\/2304576.2304623"},{"key":"39_CR98","doi-asserted-by":"crossref","unstructured":"Kim, Y., et\u00a0al.: Translating CUDA to opencl for hardware generation using neural machine translation. In: IEEE\/ACM International Symposium on Code Generation and Optimization, CGO (2019)","DOI":"10.1109\/CGO.2019.8661172"},{"key":"39_CR99","doi-asserted-by":"crossref","unstructured":"Kim, J., et\u00a0al.: Translating openmp device constructs to opencl using unnecessary data transfer elimination. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC (2016)","DOI":"10.1109\/SC.2016.50"},{"key":"39_CR100","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1109\/MS.2011.12","volume":"28","author":"W Kim","year":"2011","unstructured":"Kim, W., Voss, M.: Multicore desktop programming with intel threading building blocks. IEEE Softw. 28, 23\u201331 (2011)","journal-title":"IEEE Softw."},{"key":"39_CR101","doi-asserted-by":"crossref","unstructured":"Kistler, M., et\u00a0al.: Petascale computing with accelerators. In: D.A. Reed, V.\u00a0Sarkar (eds.) Proceedings of the 14th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPOPP (2009)","DOI":"10.1145\/1504176.1504212"},{"key":"39_CR102","doi-asserted-by":"crossref","unstructured":"Komoda, T., et\u00a0al.: Integrating multi-gpu execution in an openacc compiler. In: 42nd International Conference on Parallel Processing, ICPP (2013)","DOI":"10.1109\/ICPP.2013.35"},{"key":"39_CR103","unstructured":"Komornicki, A., et\u00a0al.: Roadrunner: hardware and software overview (2009)"},{"key":"39_CR104","doi-asserted-by":"crossref","unstructured":"Kr\u00fcger, J.H., Westermann, R.: Linear algebra operators for GPU implementation of numerical algorithms. ACM Trans. Graph (2003)","DOI":"10.1145\/1201775.882363"},{"key":"39_CR105","doi-asserted-by":"crossref","unstructured":"Kudlur, M., et\u00a0al.: Orchestrating the execution of stream programs on multicore platforms. In: Proceedings of the ACM SIGPLAN 2008 Conference on Programming Language Design and Implementation, PLDI (2008)","DOI":"10.1145\/1375581.1375596"},{"key":"39_CR106","doi-asserted-by":"crossref","unstructured":"Lee, S., Eigenmann, R.: Openmp to GPGPU: a compiler framework for automatic translation and optimization. In: Proceedings of the 14th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPOPP (2009)","DOI":"10.1145\/1504176.1504194"},{"key":"39_CR107","doi-asserted-by":"crossref","unstructured":"Lee, S., Eigenmann, R.: Openmpc: Extended openmp programming and tuning for GPUs. In: Conference on High Performance Computing Networking, Storage and Analysis, SC (2010)","DOI":"10.1109\/SC.2010.36"},{"key":"39_CR108","doi-asserted-by":"crossref","unstructured":"Lee, V.W., et\u00a0al.: Debunking the 100x GPU vs. CPU myth: an evaluation of throughput computing on CPU and GPU. In: 37th International Symposium on Computer Architecture, ISCA (2010)","DOI":"10.1145\/1815961.1816021"},{"key":"39_CR109","doi-asserted-by":"crossref","unstructured":"Lepley, T., et\u00a0al.: A novel compilation approach for image processing graphs on a many-core platform with explicitly managed memory. In: International Conference on Compilers, Architecture and Synthesis for Embedded Systems, CASES (2013)","DOI":"10.1109\/CASES.2013.6662510"},{"key":"39_CR110","doi-asserted-by":"crossref","unstructured":"Leung, A., et\u00a0al.: A mapping path for multi-gpgpu accelerated computers from a portable high level programming abstraction. In: Proceedings of 3rd Workshop on General Purpose Processing on Graphics Processing Units, GPGPU, ACM International Conference Proceeding Series (2010)","DOI":"10.1145\/1735688.1735698"},{"key":"39_CR111","doi-asserted-by":"crossref","unstructured":"Li, Z., et\u00a0al.: Evaluating the performance impact of multiple streams on the mic-based heterogeneous platform. In: 2016 IEEE International Parallel and Distributed Processing Symposium Workshops, IPDPS Workshops (2016a)","DOI":"10.1109\/IPDPSW.2016.99"},{"key":"39_CR112","doi-asserted-by":"crossref","unstructured":"Li, Z., et\u00a0al.: Streaming applications on heterogeneous platforms. In: Network and Parallel Computing\u201413th IFIP WG 10.3 International Conference, NPC (2016b)","DOI":"10.1007\/978-3-319-47099-3_10"},{"key":"39_CR113","first-page":"1236","volume":"19","author":"X Liao","year":"2018","unstructured":"Liao, X., et al.: Moving from exascale to zettascale computing: challenges and techniques. Front. IT EE 19, 1236\u20131244 (2018)","journal-title":"Front. IT EE"},{"key":"39_CR114","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1109\/MM.2008.31","volume":"28","author":"E Lindholm","year":"2008","unstructured":"Lindholm, E., et al.: NVIDIA tesla: a unified graphics and computing architecture. IEEE Micro 28, 39\u201355 (2008)","journal-title":"IEEE Micro"},{"key":"39_CR115","doi-asserted-by":"crossref","unstructured":"Liu, B., et\u00a0al.: Software pipelining for graphic processing unit acceleration: partition, scheduling and granularity. IJHPCA (2016)","DOI":"10.1177\/1094342015585845"},{"key":"39_CR116","doi-asserted-by":"crossref","unstructured":"Marco, V.S., et\u00a0al.: Improving spark application throughput via memory aware task co-location: a mixture of experts approach. In: Middleware (2017)","DOI":"10.1145\/3135974.3135984"},{"key":"39_CR117","doi-asserted-by":"crossref","unstructured":"Mark, W.R., et al.: Cg: a system for programming graphics hardware in a c-like language. ACM Trans. Graph (2003)","DOI":"10.1145\/1201775.882362"},{"key":"39_CR118","doi-asserted-by":"crossref","unstructured":"Marqu\u00e9s, R., et\u00a0al.: Algorithmic skeleton framework for the orchestration of GPU computations. In: Euro-Par 2013 Parallel Processing, Lecture Notes in Computer Science (2013)","DOI":"10.1007\/978-3-642-40047-6_86"},{"key":"39_CR119","doi-asserted-by":"crossref","unstructured":"Martinez, G., et\u00a0al.: CU2CL: A cuda-to-opencl translator for multi- and many-core architectures. In: 17th IEEE International Conference on Parallel and Distributed Systems, ICPADS (2011)","DOI":"10.1109\/ICPADS.2011.48"},{"key":"39_CR120","doi-asserted-by":"crossref","unstructured":"Membarth, R., et al.: Hipa$${}^{\\text{cc}}$$: A domain-specific language and compiler for image processing. IEEE Trans. Parallel Distrib. Syst (2016)","DOI":"10.1109\/TPDS.2015.2394802"},{"key":"39_CR121","doi-asserted-by":"crossref","unstructured":"Membarth, R., et\u00a0al.: Generating device-specific GPU code for local operators in medical imaging. In: 26th IEEE International Parallel and Distributed Processing Symposium, IPDPS (2012)","DOI":"10.1109\/IPDPS.2012.59"},{"key":"39_CR122","doi-asserted-by":"crossref","unstructured":"Mendonca, G.S.D., et\u00a0al.: Dawncc: Automatic annotation for data parallelism and offloading. TACO (2017)","DOI":"10.1145\/3084540"},{"key":"39_CR123","doi-asserted-by":"crossref","unstructured":"Merrill, D., et\u00a0al.: Scalable GPU graph traversal. In: Proceedings of the 17th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPOPP (2012)","DOI":"10.1145\/2145816.2145832"},{"key":"39_CR124","doi-asserted-by":"crossref","unstructured":"Meswani, M.R., et\u00a0al.: Modeling and predicting performance of high performance computing applications on hardware accelerators. IJHPCA (2013)","DOI":"10.1109\/IPDPSW.2012.226"},{"key":"39_CR125","doi-asserted-by":"crossref","unstructured":"Mishra, A., et\u00a0al.: Kernel fusion\/decomposition for automatic gpu-offloading. In: IEEE\/ACM International Symposium on Code Generation and Optimization, CGO (2019)","DOI":"10.1109\/CGO.2019.8661188"},{"key":"39_CR126","unstructured":"MPI.: Message Passing Interface. https:\/\/computing.llnl.gov\/tutorials\/mpi\/ (2020)"},{"key":"39_CR127","doi-asserted-by":"crossref","unstructured":"Muralidharan, S., et\u00a0al.: Architecture-adaptive code variant tuning. In: Proceedings of the Twenty-First International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS (2016)","DOI":"10.1145\/2872362.2872411"},{"key":"39_CR128","doi-asserted-by":"crossref","unstructured":"Newburn, C.J., et\u00a0al.: Heterogeneous streaming. In: IPDPSW (2016)","DOI":"10.1109\/IPDPSW.2016.217"},{"key":"39_CR129","doi-asserted-by":"crossref","unstructured":"Nomizu, T., et\u00a0al.: Implementation of xcalablemp device acceleration extention with opencl. In: 26th IEEE International Parallel and Distributed Processing Symposium Workshops & PhD Forum, IPDPSWP (2012)","DOI":"10.1109\/IPDPSW.2012.296"},{"key":"39_CR130","doi-asserted-by":"crossref","unstructured":"Nugteren, C., Corporaal, H.: Introducing \u2019bones\u2019: a parallelizing source-to-source compiler based on algorithmic skeletons. In: The 5th Annual Workshop on General Purpose Processing with Graphics Processing Units, GPGPU (2012)","DOI":"10.1145\/2159430.2159431"},{"key":"39_CR131","unstructured":"NVIDIA CUDA Toolkit.: https:\/\/developer.nvidia.com\/cuda-toolkit (2020)"},{"key":"39_CR132","unstructured":"Nvidia geforce gtx 980.: Tech. rep., NVIDIA Corporation (2014)"},{"key":"39_CR133","unstructured":"Nvidia tesla p100.: Tech. rep., NVIDIA Corporation (2016)"},{"key":"39_CR134","unstructured":"Nvidia tesla v100 gpu architecture.: Tech. rep., NVIDIA Corporation (2017)"},{"key":"39_CR135","unstructured":"Nvidia turing gpu architecture.: Tech. rep., NVIDIA Corporation (2018)"},{"key":"39_CR136","unstructured":"Nvidia\u2019s next generation cuda compute architecture.: Fermi. NVIDIA Corporation, Tech. rep. (2009)"},{"key":"39_CR137","unstructured":"Nvidia\u2019s next generation cuda compute architecture.: Kepler tm gk110\/210. NVIDIA Corporation, Tech. rep. (2014)"},{"key":"39_CR138","doi-asserted-by":"publisher","first-page":"289","DOI":"10.1007\/s10766-008-0072-7","volume":"36","author":"K O\u2019Brien","year":"2008","unstructured":"O\u2019Brien, K., et al.: Supporting openmp on cell. Int. J. Parallel Program. 36, 289\u2013311 (2008)","journal-title":"Int. J. Parallel Program."},{"key":"39_CR139","doi-asserted-by":"crossref","unstructured":"Ogilvie, W.F., et\u00a0al.: Fast automatic heuristic construction using active learning. In: LCPC (2014)","DOI":"10.1007\/978-3-319-17473-0_10"},{"key":"39_CR140","unstructured":"OpenCL.: The open standard for parallel programming of heterogeneous systems. http:\/\/www.khronos.org\/opencl\/ (2020)"},{"key":"39_CR141","doi-asserted-by":"crossref","unstructured":"Owens, J.D., et al.: GPU computing. Proceedings of the IEEE (2008)","DOI":"10.1109\/JPROC.2008.917757"},{"key":"39_CR142","unstructured":"Owens, J.D., et\u00a0al.: A survey of general-purpose computation on graphics hardware. In: Eurographics, pp. 21\u201351 (2005)"},{"key":"39_CR143","unstructured":"Parallel Patterns Library.: https:\/\/docs.microsoft.com\/en-us\/cpp\/parallel\/concrt\/parallel-patterns-library-ppl?view=vs-2019 (2016)"},{"key":"39_CR144","unstructured":"Paszke, A., et\u00a0al.: Pytorch: an imperative style, high-performance deep learning library. In: Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS, pp. 8024\u20138035 (2019)"},{"key":"39_CR145","doi-asserted-by":"crossref","unstructured":"Patterson, D.A.: 50 years of computer architecture: From the mainframe CPU to the domain-specific tpu and the open RISC-V instruction set. In: 2018 IEEE International Solid-State Circuits Conference, ISSCC (2018)","DOI":"10.1109\/ISSCC.2018.8310168"},{"key":"39_CR146","unstructured":"PGI CUDA C\/C++ for x86.: https:\/\/developer.nvidia.com\/pgi-cuda-cc-x86 (2020)"},{"key":"39_CR147","unstructured":"Pham, D., et\u00a0al.: The design methodology and implementation of a first-generation CELL processor: a multi-core soc. In: Proceedings of the IEEE 2005 Custom Integrated Circuits Conference, CICC (2005)"},{"key":"39_CR148","unstructured":"PIPS.: Automatic Parallelizer and Code Transformation Framework. https:\/\/pips4u.org\/ (2020)"},{"key":"39_CR149","unstructured":"Qualcomm snapdragon mobile platform opencl general programming and optimization.: Tech. rep., Qualcomm Corporation (2017)"},{"key":"39_CR150","doi-asserted-by":"crossref","unstructured":"Ragan-Kelley, J., et al.: Decoupling algorithms from schedules for easy optimization of image processing pipelines. ACM Trans. Graph (2012)","DOI":"10.1145\/2185520.2185528"},{"key":"39_CR151","doi-asserted-by":"crossref","unstructured":"Ragan-Kelley, J., et\u00a0al.: Halide: a language and compiler for optimizing parallelism, locality, and recomputation in image processing pipelines. In: ACM SIGPLAN Conference on Programming Language Design and Implementation, PLDI (2013)","DOI":"10.1145\/2491956.2462176"},{"key":"39_CR152","doi-asserted-by":"crossref","unstructured":"Ravi, N., et\u00a0al.: Apricot: an optimizing compiler and productivity tool for x86-compatible many-core coprocessors. In: International Conference on Supercomputing, ICS (2012)","DOI":"10.1145\/2304576.2304585"},{"key":"39_CR153","doi-asserted-by":"crossref","unstructured":"Ren, J., et\u00a0al.: Camel: Smart, adaptive energy optimization for mobile web interactions. In: IEEE Conference on Computer Communications (INFOCOM) (2020)","DOI":"10.1109\/INFOCOM41043.2020.9155489"},{"key":"39_CR154","doi-asserted-by":"crossref","unstructured":"Ren, J., et\u00a0al.: Optimise web browsing on heterogeneous mobile platforms: a machine learning based approach. In: INFOCOM (2017)","DOI":"10.1109\/INFOCOM.2017.8057087"},{"key":"39_CR155","doi-asserted-by":"crossref","unstructured":"Ren, J., et\u00a0al.: Proteus: Network-aware web browsing on heterogeneous mobile systems. In: CoNEXT \u201918 (2018)","DOI":"10.1145\/3281411.3281422"},{"key":"39_CR156","unstructured":"Renderscript Compute.: http:\/\/developer.android.com\/guide\/topics\/renderscript\/compute.html (2020)"},{"key":"39_CR157","unstructured":"ROCm Runtime.: https:\/\/github.com\/RadeonOpenCompute \/ROCR-Runtime (2020)"},{"key":"39_CR158","unstructured":"ROCm.: A New Era in Open GPU Computing. https:\/\/www.amd.com\/en\/graphics\/servers-solutions-rocm-hpc (2020)"},{"key":"39_CR159","doi-asserted-by":"crossref","unstructured":"Rudy, G., et\u00a0al.: A programming language interface to describe transformations and code generation. In: Languages and Compilers for Parallel Computing - 23rd International Workshop, LCPC (2010)","DOI":"10.1007\/978-3-642-19595-2_10"},{"key":"39_CR160","first-page":"1","volume":"19","author":"V Sanz Marco","year":"2019","unstructured":"Sanz Marco, V., et al.: Optimizing deep learning inference on embedded systems through adaptive model selection. ACM Trans. Embed. Comput. 19, 1\u201328 (2019)","journal-title":"ACM Trans. Embed. Comput."},{"key":"39_CR161","doi-asserted-by":"crossref","unstructured":"Sathre, P., et\u00a0al.: On the portability of cpu-accelerated applications via automated source-to-source translation. In: Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region, HPC Asia (2019)","DOI":"10.1145\/3293320.3293338"},{"key":"39_CR162","doi-asserted-by":"publisher","first-page":"1381","DOI":"10.1109\/TPDS.2007.70811","volume":"19","author":"DP Scarpazza","year":"2008","unstructured":"Scarpazza, D.P., et al.: Efficient breadth-first search on the cell\/be processor. IEEE Trans. Parallel Distrib. Syst. 19, 1381\u20131395 (2008)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"39_CR163","first-page":"1","volume":"27","author":"L Seiler","year":"2009","unstructured":"Seiler, L., et al.: Larrabee: a many-core x86 architecture for visual computing. IEEE Micro 27, 1\u201315 (2009)","journal-title":"IEEE Micro"},{"key":"39_CR164","doi-asserted-by":"crossref","unstructured":"Sidelnik, A., et\u00a0al.: Performance portability with the chapel language. In: 26th IEEE International Parallel and Distributed Processing Symposium, IPDPS, pp. 582\u2013594 (2012)","DOI":"10.1109\/IPDPS.2012.60"},{"key":"39_CR165","doi-asserted-by":"crossref","unstructured":"Steinkrau, D., et\u00a0al.: Using GPUs for machine learning algorithms. In: Eighth International Conference on Document Analysis and Recognition (ICDAR. IEEE Computer Society (2005)","DOI":"10.1109\/ICDAR.2005.251"},{"key":"39_CR166","doi-asserted-by":"crossref","unstructured":"Steuwer, M., et\u00a0al.: Skelcl\u2014a portable skeleton library for high-level GPU programming. In: 25th IEEE International Symposium on Parallel and Distributed Processing, IPDPS (2011)","DOI":"10.1109\/IPDPS.2011.269"},{"key":"39_CR167","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1007\/s11227-014-1213-y","volume":"69","author":"M Steuwer","year":"2014","unstructured":"Steuwer, M., Gorlatch, S.: Skelcl: a high-level extension of opencl for multi-gpu systems. J. Supercomput. 69, 23\u201325 (2014)","journal-title":"J. Supercomput."},{"key":"39_CR168","doi-asserted-by":"crossref","unstructured":"Stratton, J.A., et\u00a0al.: MCUDA: an efficient implementation of CUDA kernels for multi-core CPUs. In: Languages and Compilers for Parallel Computing, 21th International Workshop, LCPC (2008)","DOI":"10.1007\/978-3-540-89740-8_2"},{"key":"39_CR169","unstructured":"Sycl integrates opencl devices with modern c++.: Tech. Rep. version 1.2.1 revison 6, The Khronos Group (2019)"},{"key":"39_CR170","doi-asserted-by":"crossref","unstructured":"Szuppe, J.: Boost.compute: A parallel computing library for C++ based on opencl. In: Proceedings of the 4th International Workshop on OpenCL, IWOCL (2016)","DOI":"10.1145\/2909437.2909454"},{"key":"39_CR171","doi-asserted-by":"crossref","unstructured":"Taylor, B., et\u00a0al.: Adaptive optimization for opencl programs on embedded heterogeneous systems. In: LCTES (2017)","DOI":"10.1145\/3078633.3081040"},{"key":"39_CR172","unstructured":"The Aurora Supercomputer.: https:\/\/aurora.alcf.anl.gov\/ (2020)"},{"key":"39_CR173","unstructured":"The El Capitan Supercomputer.: https:\/\/www.cray.com\/company\/customers\/lawrence-livermore-national-lab (2020)"},{"key":"39_CR174","unstructured":"The Frontier Supercomputer.: https:\/\/www.olcf.ornl.gov\/frontier\/ (2020)"},{"key":"39_CR175","unstructured":"The OpenACC API specification for parallel programming.: https:\/\/www.openacc.org\/ (2020)"},{"key":"39_CR176","unstructured":"The OpenCL Conformance Tests.: https:\/\/github.com\/KhronosGroup\/OpenCL-CTS (2020)"},{"key":"39_CR177","unstructured":"The OpenMP API specification for parallel programming.: https:\/\/www.openmp.org\/ (2020)"},{"key":"39_CR178","unstructured":"The Tianhe-2 Supercomputer.: https:\/\/top500.org\/system\/177999 (2020)"},{"key":"39_CR179","unstructured":"TI\u2019s OpenCL Implementation.: https:\/\/git.ti.com\/cgit\/opencl (2020)"},{"key":"39_CR180","doi-asserted-by":"publisher","first-page":"232","DOI":"10.1016\/j.parco.2009.12.005","volume":"36","author":"S Tomov","year":"2010","unstructured":"Tomov, S., et al.: Towards dense linear algebra for hybrid GPU accelerated manycore systems. Parallel Comput. 36, 232\u2013240 (2010)","journal-title":"Parallel Comput."},{"key":"39_CR181","unstructured":"Top500 Supercomputers.: https:\/\/www.top500.org\/ (2020)"},{"key":"39_CR182","doi-asserted-by":"publisher","first-page":"177","DOI":"10.1145\/1543135.1542496","volume":"44","author":"G Tournavitis","year":"2009","unstructured":"Tournavitis, G., et al.: Towards a holistic approach to auto-parallelization: integrating profile-driven parallelism detection and machine-learning based mapping. ACM Sigplan Not. 44, 177\u2013187 (2009)","journal-title":"ACM Sigplan Not."},{"key":"39_CR183","unstructured":"Trevett, N.: Opencl, sycl and spir\u2014the next steps. Tech. rep, OpenCL Working Group (2019)"},{"key":"39_CR184","doi-asserted-by":"crossref","unstructured":"Ueng, S., et\u00a0al.: Cuda-lite: Reducing GPU programming complexity. In: J.N. Amaral (ed.) Languages and Compilers for Parallel Computing, 21th International Workshop, LCPC (2008)","DOI":"10.1007\/978-3-540-89740-8_1"},{"key":"39_CR185","doi-asserted-by":"crossref","unstructured":"Unat, D., et\u00a0al.: Mint: realizing CUDA performance in 3d stencil methods with annotated C. In: Proceedings of the 25th International Conference on Supercomputing, (2011)","DOI":"10.1145\/1995896.1995932"},{"key":"39_CR186","doi-asserted-by":"crossref","unstructured":"van Werkhoven, B., et\u00a0al.: Performance models for CPU-GPU data transfers. In: 14th IEEE\/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGrid) (2014)","DOI":"10.1109\/CCGrid.2014.16"},{"key":"39_CR1","unstructured":"\u201cvega\u201d instruction set architecture.: Tech. rep., AMD Corporation (2017)"},{"key":"39_CR187","doi-asserted-by":"crossref","unstructured":"Verdoolaege, S., et\u00a0al.: Polyhedral parallel code generation for CUDA. ACM TACO (2013)","DOI":"10.1145\/2400682.2400713"},{"key":"39_CR188","doi-asserted-by":"publisher","first-page":"1627","DOI":"10.1016\/j.jpdc.2013.07.013","volume":"73","author":"M Vi\u00f1as","year":"2013","unstructured":"Vi\u00f1as, M., et al.: Exploiting heterogeneous parallelism with the heterogeneous programming library. J. Parallel Distrib. Comput. 73, 1627\u20131638 (2013)","journal-title":"J. Parallel Distrib. Comput."},{"key":"39_CR189","doi-asserted-by":"publisher","first-page":"e4664","DOI":"10.1002\/cpe.4664","volume":"20","author":"M Vi\u00f1as","year":"2018","unstructured":"Vi\u00f1as, M., et al.: Heterogeneous distributed computing based on high-level abstractions. Pract. Exp. Concurr. Comput. 20, e4664 (2018)","journal-title":"Pract. Exp. Concurr. Comput."},{"key":"39_CR190","doi-asserted-by":"crossref","unstructured":"Wang, Z., et\u00a0al.: Automatic and portable mapping of data parallel programs to opencl for gpu-based heterogeneous systems. ACM TACO (2015)","DOI":"10.1145\/2677036"},{"key":"39_CR191","doi-asserted-by":"crossref","unstructured":"Wang, Z., et\u00a0al.: Exploitation of GPUs for the parallelisation of probably parallel legacy code. In: CC \u201914 (2014a)","DOI":"10.1007\/978-3-642-54807-9_9"},{"key":"39_CR192","doi-asserted-by":"crossref","unstructured":"Wang, Z., et\u00a0al.: Integrating profile-driven parallelism detection and machine-learning-based mapping. ACM TACO (2014b)","DOI":"10.1145\/2579561"},{"key":"39_CR193","doi-asserted-by":"crossref","unstructured":"Wang, Z., O\u2019Boyle, M.: Machine learning in compiler optimisation. In: Proceedings of IEEE (2018)","DOI":"10.1109\/JPROC.2018.2817118"},{"key":"39_CR194","doi-asserted-by":"crossref","unstructured":"Wang, Z., O\u2019Boyle, M.F.: Partitioning streaming parallelism for multi-cores: a machine learning based approach. In: PACT (2010)","DOI":"10.1145\/1854273.1854313"},{"key":"39_CR195","doi-asserted-by":"crossref","unstructured":"Wang, Z., O\u2019Boyle, M.F.: Using machine learning to partition streaming programs. ACM TACO (2013)","DOI":"10.1145\/2512436"},{"key":"39_CR196","doi-asserted-by":"crossref","unstructured":"Wang, Z.: Machine learning based mapping of data and streaming parallelism to multi-cores. Ph.D. thesis, University of Edinburgh (2011)","DOI":"10.1145\/1854273.1854313"},{"key":"39_CR197","doi-asserted-by":"crossref","unstructured":"Wen, Y., et\u00a0al.: Smart multi-task scheduling for opencl programs on cpu\/gpu heterogeneous platforms. In: HiPC (2014)","DOI":"10.1109\/HiPC.2014.7116910"},{"key":"39_CR198","doi-asserted-by":"crossref","unstructured":"Williams, S., et\u00a0al.: The potential of the cell processor for scientific computing. In: Proceedings of the Third Conference on Computing Frontiers (2006)","DOI":"10.1145\/1128022.1128027"},{"key":"39_CR199","doi-asserted-by":"crossref","unstructured":"Wong, H., et\u00a0al.: Demystifying GPU microarchitecture through microbenchmarking. In: IEEE International Symposium on Performance Analysis of Systems and Software, ISPASS (2010)","DOI":"10.1109\/ISPASS.2010.5452013"},{"key":"39_CR200","doi-asserted-by":"crossref","unstructured":"Yan, Y., et\u00a0al.: Supporting multiple accelerators in high-level programming models. In: Proceedings of the Sixth International Workshop on Programming Models and Applications for Multicores and Manycores, PMAM@PPoPP (2015)","DOI":"10.1145\/2712386.2712405"},{"key":"39_CR201","doi-asserted-by":"crossref","unstructured":"Yang, C., et\u00a0al.: O2render: An opencl-to-renderscript translator for porting across various GPUs or CPUs. In: IEEE 10th Symposium on Embedded Systems for Real-time Multimedia, ESTIMedia (2012)","DOI":"10.1109\/ESTIMedia.2012.6507031"},{"key":"39_CR202","doi-asserted-by":"crossref","unstructured":"You, Y., et\u00a0al.: Virtcl: a framework for opencl device abstraction and management. In: Proceedings of the 20th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP (2015)","DOI":"10.1145\/2688500.2688505"},{"key":"39_CR203","doi-asserted-by":"publisher","first-page":"139394","DOI":"10.1109\/ACCESS.2019.2936620","volume":"7","author":"L Yuan","year":"2019","unstructured":"Yuan, L., et al.: Using machine learning to optimize web interactions on heterogeneous mobile systems. IEEE Access 7, 139394\u2013139408 (2019)","journal-title":"IEEE Access"},{"key":"39_CR204","doi-asserted-by":"crossref","unstructured":"Zenker, E., et\u00a0al.: Alpaka\u2014an abstraction library for parallel kernel acceleration. In: 2016 IEEE International Parallel and Distributed Processing Symposium Workshops, IPDPS Workshops (2016)","DOI":"10.1109\/IPDPSW.2016.50"},{"key":"39_CR205","doi-asserted-by":"crossref","unstructured":"Zhang, P., et\u00a0al.: Auto-tuning streamed applications on intel xeon phi. In: 2018 IEEE International Parallel and Distributed Processing Symposium, IPDPS (2018a)","DOI":"10.1109\/IPDPS.2018.00061"},{"key":"39_CR206","doi-asserted-by":"crossref","unstructured":"Zhang, P., et\u00a0al.: MOCL: an efficient opencl implementation for the matrix-2000 architecture. In: Proceedings of the 15th ACM International Conference on Computing Frontiers, CF (2018bb)","DOI":"10.1145\/3203217.3203244"},{"key":"39_CR207","doi-asserted-by":"crossref","unstructured":"Zhang, P., et\u00a0al.: Optimizing streaming parallelism on heterogeneous many-core architectures. IEEE TPDS (2020)","DOI":"10.1109\/TPDS.2020.2978045"},{"key":"39_CR208","doi-asserted-by":"crossref","unstructured":"Zhao, J., et\u00a0al.: Predicting cross-core performance interference on multicore processors with regression analysis. IEEE TPDS (2016)","DOI":"10.1109\/TPDS.2015.2442983"},{"key":"39_CR209","unstructured":"ZiiLABS OpenCL.: http:\/\/www.ziilabs.com\/products\/ software\/opencl.php (2020)"}],"container-title":["CCF Transactions on High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-020-00039-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42514-020-00039-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-020-00039-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,5]],"date-time":"2022-11-05T08:53:38Z","timestamp":1667638418000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42514-020-00039-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,7,31]]},"references-count":209,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2020,12]]}},"alternative-id":["39"],"URL":"https:\/\/doi.org\/10.1007\/s42514-020-00039-4","relation":{},"ISSN":["2524-4922","2524-4930"],"issn-type":[{"value":"2524-4922","type":"print"},{"value":"2524-4930","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,7,31]]},"assertion":[{"value":"18 February 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 May 2020","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 July 2020","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}