{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,26]],"date-time":"2025-09-26T00:08:42Z","timestamp":1758845322342},"reference-count":38,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2015,3,20]],"date-time":"2015-03-20T00:00:00Z","timestamp":1426809600000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2016,6]]},"DOI":"10.1007\/s10766-015-0362-9","type":"journal-article","created":{"date-parts":[[2015,3,19]],"date-time":"2015-03-19T07:24:53Z","timestamp":1426749893000},"page":"620-643","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Locality-Aware Automatic Parallelization for GPGPU with OpenHMPP Directives"],"prefix":"10.1007","volume":"44","author":[{"given":"Jos\u00e9 M.","family":"Andi\u00f3n","sequence":"first","affiliation":[]},{"given":"Manuel","family":"Arenaz","sequence":"additional","affiliation":[]},{"given":"Fran\u00e7ois","family":"Bodin","sequence":"additional","affiliation":[]},{"given":"Gabriel","family":"Rodr\u00edguez","sequence":"additional","affiliation":[]},{"given":"Juan","family":"Touri\u00f1o","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2015,3,20]]},"reference":[{"issue":"9","key":"362_CR1","doi-asserted-by":"crossref","first-page":"442","DOI":"10.1016\/j.parco.2013.04.003","volume":"39","author":"JM Andi\u00f3n","year":"2013","unstructured":"Andi\u00f3n, J.M., Arenaz, M., Rodr\u00edguez, G., Touri\u00f1o, J.: A novel compiler support for automatic parallelization on multicore systems. Parallel Comput. 39(9), 442\u2013460 (2013)","journal-title":"Parallel Comput."},{"issue":"18","key":"362_CR2","doi-asserted-by":"crossref","first-page":"2407","DOI":"10.1002\/cpe.1173","volume":"19","author":"D Andrade","year":"2007","unstructured":"Andrade, D., Arenaz, M., Fraguela, B.B., Touri\u00f1o, J., Doallo, R.: Automated and accurate cache behavior analysis for codes with irregular access patterns. Concurr. Comput. Pract. Exp. 19(18), 2407\u20132423 (2007)","journal-title":"Concurr. Comput. Pract. Exp."},{"key":"362_CR3","unstructured":"Appentra Solutions: Parallware for OpenACC. http:\/\/www.appentra.com\/products\/parallware\/ . Accessed 31 Jan 2015"},{"key":"362_CR4","doi-asserted-by":"crossref","unstructured":"Arenaz, M., Touri\u00f1o, J., Doallo, R.: Compiler support for parallel code generation through kernel recognition. In: Proceedings of the 18th International Parallel and Distributed Processing Symposium (IPDPS), Santa Fe, NM, USA, p. 79b. IEEE (2004)","DOI":"10.1109\/IPDPS.2004.1303015"},{"issue":"6","key":"362_CR5","doi-asserted-by":"crossref","first-page":"32:1","DOI":"10.1145\/1391956.1391959","volume":"30","author":"M Arenaz","year":"2008","unstructured":"Arenaz, M., Touri\u00f1o, J., Doallo, R.: XARK: an extensible framework for automatic recognition of computational kernels. ACM Trans. Program. Lang. Syst. 30(6), 32:1\u201332:56 (2008)","journal-title":"ACM Trans. Program. Lang. Syst."},{"key":"362_CR6","doi-asserted-by":"crossref","unstructured":"Baskaran, M.M., Ramanujam, J., Sadayappan, P.: Automatic C-to-CUDA code generation for affine programs. In: Proceedings of the 19th International Conference on Compiler Construction (CC), Paphos, Cyprus, LNCS, vol. 6011, pp. 244\u2013263. Springer (2010)","DOI":"10.1007\/978-3-642-11970-5_14"},{"key":"362_CR7","unstructured":"BLAS: Basic Linear Algebra Subprograms. http:\/\/www.netlib.org\/blas\/ . Accessed 31 Jan 2015"},{"issue":"4","key":"362_CR8","first-page":"325","volume":"17","author":"F Bodin","year":"2009","unstructured":"Bodin, F., Bihan, S.: Heterogeneous multicore parallel programming for graphics processing units. Sci. Program. 17(4), 325\u2013336 (2009)","journal-title":"Sci. Program."},{"key":"362_CR9","doi-asserted-by":"crossref","unstructured":"Bondhugula, U., Hartono, A., Ramanujam, J., Sadayappan, P.: A practical automatic polyhedral parallelizer and locality optimizer. In: Proceedings of the 29th Conference on Programming Language Design and Implementation (PLDI), Tucson, AZ, USA, pp. 101\u2013113. ACM (2008)","DOI":"10.1145\/1375581.1375595"},{"issue":"3\u20134","key":"362_CR10","doi-asserted-by":"crossref","first-page":"205","DOI":"10.1007\/s00450-011-0160-6","volume":"26","author":"M Christen","year":"2011","unstructured":"Christen, M., Schenk, O., Burkhart, H.: Automatic code generation and tuning for stencil kernels on modern shared memory architectures. Comp. Sci. Res. Dev. 26(3\u20134), 205\u2013210 (2011)","journal-title":"Comp. Sci. Res. Dev."},{"key":"362_CR11","doi-asserted-by":"crossref","unstructured":"Eigenmann, R., Hoeflinger, J., Li, Z., Padua, D.A.: Experience in the automatic parallelization of four perfect-benchmark programs. In: Proceedings of the 4th International Workshop on Languages and Compilers for Parallel Computing (LCPC), Santa Clara, CA, USA, LNCS, vol. 589, pp. 65\u201383. Springer (1992)","DOI":"10.1007\/BFb0038658"},{"key":"362_CR12","doi-asserted-by":"crossref","unstructured":"Grauer-Gray, S., Xu, L., Searles, R., Ayalasomayajula, S., Cavazos, J.: Auto-tuning a high-level language targeted to GPU codes. In: Proceedings of Innovative Parallel Computing (InPar), San Jose, CA, USA, pp. 1\u201310. IEEE (2012)","DOI":"10.1109\/InPar.2012.6339595"},{"issue":"1","key":"362_CR13","doi-asserted-by":"crossref","first-page":"78","DOI":"10.1109\/TPDS.2010.62","volume":"22","author":"TD Han","year":"2011","unstructured":"Han, T.D., Abdelrahman, T.S.: hiCUDA: High-level GPGPU programming. IEEE Trans. Parallel Distrib. Syst. 22(1), 78\u201390 (2011)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"362_CR14","unstructured":"HPC Project: Par4All. http:\/\/www.par4all.org\/ . Accessed 31 Jan 2015"},{"key":"362_CR15","unstructured":"Intel Corporation: Intel Math Kernel Library. http:\/\/software.intel.com\/intel-mkl\/ . Accessed 31 Jan 2015"},{"key":"362_CR16","doi-asserted-by":"crossref","unstructured":"Jablin, T.B., Jablin, J.A., Prabhu, P., Liu, F., August, D.I.: Dynamically managed data for CPU\u2013GPU architectures. In: Proceedings of the 10th International Symposium on Code Generation and Optimization (CGO), San Jose, CA, USA, pp. 165\u2013174. ACM (2012)","DOI":"10.1145\/2259016.2259038"},{"key":"362_CR17","doi-asserted-by":"crossref","unstructured":"Jablin, T.B., Prabhu, P., Jablin, J.A., Johnson, N.P., Beard, S.R., August, D.I.: Automatic CPU\u2013GPU communication management and optimization. In: Proceedings of the 32nd Conference on Programming Language Design and Implementation (PLDI), San Jose, CA, USA, pp. 142\u2013151. ACM (2011)","DOI":"10.1145\/1993498.1993516"},{"issue":"11","key":"362_CR18","doi-asserted-by":"crossref","first-page":"2045","DOI":"10.1109\/TPDS.2011.311","volume":"23","author":"J Kurzak","year":"2012","unstructured":"Kurzak, J., Tomov, S., Dongarra, J.: Autotuning GEMM kernels for the Fermi GPU. IEEE Trans. Parallel Distrib. Syst. 23(11), 2045\u20132057 (2012)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"362_CR19","doi-asserted-by":"crossref","unstructured":"Larsen, E.S., McAllister, D.: Fast matrix multiplies using graphics hardware. In: Proceedings of the 14th International Conference on High Performance Computing, Networking, Storage and Analysis (SC), Denver, CO, USA, p. 55. ACM (2001)","DOI":"10.1145\/582034.582089"},{"key":"362_CR20","doi-asserted-by":"crossref","unstructured":"Lee, S., Eigenmann, R.: OpenMPC: Extended OpenMP programming and tuning for GPUs. In: Proceedings of the 23rd International Conference on High Performance Computing, Networking, Storage and Analysis (SC), New Orleans, LA, USA, pp. 1\u201311. IEEE (2010)","DOI":"10.1109\/SC.2010.36"},{"key":"362_CR21","doi-asserted-by":"crossref","unstructured":"Lee, S., Vetter, J.S.: Early evaluation of directive-based GPU programming models for productive exascale computing. In: Proceedings of the 25th International Conference on High Performance Computing, Networking, Storage and Analysis (SC), Salt Lake City, UT, USA, pp. 23:1\u201323:11. IEEE (2012)","DOI":"10.1109\/SC.2012.51"},{"key":"362_CR22","unstructured":"Novatte Pte. Ltd.: CAPS Compilers. http:\/\/www.novatte.com\/component\/content\/article\/126-products\/hpcclusters\/301-caps-compilers-for-cuda-and-opencl\/ . Accessed 31 Jan 2015"},{"key":"362_CR23","unstructured":"NVIDIA Corporation: Cg Toolkit. http:\/\/developer.nvidia.com\/Cg\/ . Accessed 31 Jan 2015"},{"key":"362_CR24","unstructured":"NVIDIA Corporation: CUBLAS Library. https:\/\/developer.nvidia.com\/cublas\/ . Accessed 31 Jan 2015"},{"key":"362_CR25","unstructured":"NVIDIA Corporation: CUDA C Best Practices Guide. http:\/\/docs.nvidia.com\/cuda\/cuda-c-best-practices-guide\/ . Accessed 31 Jan 2015"},{"key":"362_CR26","unstructured":"NVIDIA Corporation: CUDA C Programming Guide. http:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/ . Accessed 31 Jan 2015"},{"key":"362_CR27","unstructured":"OpenHMPP Consortium: OpenHMPP Concepts and Directives. http:\/\/en.wikipedia.org\/wiki\/OpenHMPP . Accessed 31 Jan 2015"},{"key":"362_CR28","unstructured":"OpenMP Architecture Review Board: OpenMP Application Program Interface (Version 4.0). http:\/\/www.openmp.org\/mp-documents\/OpenMP4.0.0.pdf . Accessed 31 Jan 2015"},{"issue":"5","key":"362_CR29","doi-asserted-by":"crossref","first-page":"879","DOI":"10.1109\/JPROC.2008.917757","volume":"96","author":"J Owens","year":"2008","unstructured":"Owens, J., Houston, M., Luebke, D., Green, S., Stone, J., Phillips, J.: GPU computing. Proc. IEEE 96(5), 879\u2013899 (2008)","journal-title":"Proc. IEEE"},{"key":"362_CR30","unstructured":"The Khronos Group Inc.: The OpenCL Specification (Version 2.0). http:\/\/www.khronos.org\/registry\/cl\/specs\/opencl-2.0.pdf . Accessed 31 Jan 2015"},{"key":"362_CR31","unstructured":"The Khronos Group Inc.: The OpenGL Shading Language (Version 4.50). https:\/\/www.opengl.org\/registry\/doc\/GLSLangSpec.4.50.pdf . Accessed 31 Jan 2015"},{"key":"362_CR32","unstructured":"The OpenACC Standards Group: The OpenACC Application Programming Interface (Version 2.0a). http:\/\/www.openacc.org\/sites\/default\/files\/OpenACC.2.0a_1.pdf . Accessed 31 Jan 2015"},{"issue":"4","key":"362_CR33","doi-asserted-by":"crossref","first-page":"54:1","DOI":"10.1145\/2400682.2400713","volume":"9","author":"S Verdoolaege","year":"2013","unstructured":"Verdoolaege, S., Juega, J.C., Cohen, A., G\u00f3mez, J.I., Tenllado, C., Catthoor, F.: Polyhedral parallel code generation for CUDA. ACM Trans. Archit. Code Optim. 9(4), 54:1\u201354:23 (2013)","journal-title":"ACM Trans. Archit. Code Optim."},{"issue":"8","key":"362_CR34","doi-asserted-by":"crossref","first-page":"1153","DOI":"10.1002\/cpe.2917","volume":"25","author":"M Vi\u00f1as","year":"2013","unstructured":"Vi\u00f1as, M., Lobeiras, J., Fraguela, B.B., Arenaz, M., Amor, M., Garc\u00eda, J.A., Castro, M.J., Doallo, R.: A multi-GPU shallow-water simulation with transport of contaminants. Concurr. Comput. Pract. Exp. 25(8), 1153\u20131169 (2013)","journal-title":"Concurr. Comput. Pract. Exp."},{"key":"362_CR35","unstructured":"Volkov, V.: Better performance at lower occupancy. In: Proceedings of the 2010 GPU technology conference (GTC), San Jose, CA, USA. NVIDIA (2010)"},{"key":"362_CR36","doi-asserted-by":"crossref","unstructured":"Wolfe, M.: Implementing the PGI accelerator model. In: Proceedings of the 3rd Workshop on General Purpose Processing on Graphics Processing Units (GPGPU), Pittsburgh, PA, USA, pp. 43\u201350. ACM (2010)","DOI":"10.1145\/1735688.1735697"},{"key":"362_CR37","doi-asserted-by":"crossref","unstructured":"Zima, E.: Simplification and optimization of transformations of chains of recurrences. In: Proceedings of the 1995 International Symposium on Symbolic and Algebraic Computation (ISSAC), Montreal, Canada, pp. 42\u201350. ACM (1995)","DOI":"10.1145\/220346.220352"},{"issue":"3","key":"362_CR38","doi-asserted-by":"crossref","first-page":"417","DOI":"10.1109\/TPDS.2012.160","volume":"24","author":"Y Zhang","year":"2013","unstructured":"Zhang, Y., Mueller, F.: Autogeneration and autotuning of 3D stencil codes on homogeneous and heterogeneous GPU clusters. IEEE Trans. Parallel Distrib. Syst. 24(3), 417\u2013427 (2013)","journal-title":"IEEE Trans. Parallel Distrib. Syst."}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-015-0362-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10766-015-0362-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-015-0362-9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,5,31]],"date-time":"2019-05-31T00:02:33Z","timestamp":1559260953000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10766-015-0362-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,3,20]]},"references-count":38,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2016,6]]}},"alternative-id":["362"],"URL":"https:\/\/doi.org\/10.1007\/s10766-015-0362-9","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"value":"0885-7458","type":"print"},{"value":"1573-7640","type":"electronic"}],"subject":[],"published":{"date-parts":[[2015,3,20]]}}}