{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T04:16:04Z","timestamp":1750306564846,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":17,"publisher":"ACM","license":[{"start":{"date-parts":[[2015,1,19]],"date-time":"2015-01-19T00:00:00Z","timestamp":1421625600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2015,1,19]]},"DOI":"10.1145\/2701310.2701316","type":"proceedings-article","created":{"date-parts":[[2015,2,17]],"date-time":"2015-02-17T13:44:08Z","timestamp":1424180648000},"page":"31-36","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Auto-Tuning OmpSs-OpenCL Kernels Across GPU Machines"],"prefix":"10.1145","author":[{"given":"Vinoth Krishnan","family":"Elangovan","sequence":"first","affiliation":[{"name":"Barcelona Supercomputing Center, Universitat Polit\u00e9cnica de Catalunya, Spain"}]},{"given":"Rosa M.","family":"Badia","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Artificial Intelligence Research Institute (IIIA), Spanish National Research Council (CSIC), Spain"}]},{"given":"Eduard","family":"Ayguad\u00e9","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Universitat Polit\u00e9cnica de Catalunya, Spain"}]}],"member":"320","published-online":{"date-parts":[[2015,1,19]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Programming guide","author":"Nvidia C. U. D. A.","year":"2008","unstructured":"Nvidia , C. U. D. A. Programming guide . 2008 . Nvidia, C. U. D. A. Programming guide. 2008."},{"key":"e_1_3_2_1_2_1","unstructured":"Kepler Architecture - White Paper http:\/\/www.nvidia.com\/object\/nvidia-kepler.html  Kepler Architecture - White Paper http:\/\/www.nvidia.com\/object\/nvidia-kepler.html"},{"key":"e_1_3_2_1_3_1","unstructured":"Benedict Gaster. et al. Heterogeneous Computing with OpenCL Morgan Kaufmann. 2011   Benedict Gaster. et al. Heterogeneous Computing with OpenCL Morgan Kaufmann. 2011"},{"key":"e_1_3_2_1_4_1","volume-title":"OmpSs-OpenCL Programming Model for Heterogeneous Systems. LCPC","author":"Elangovan","year":"2012","unstructured":"Elangovan , et al. OmpSs-OpenCL Programming Model for Heterogeneous Systems. LCPC 2012 . Elangovan, et al. OmpSs-OpenCL Programming Model for Heterogeneous Systems. LCPC 2012."},{"key":"e_1_3_2_1_5_1","volume-title":"Scalability and Parallel Execution of OmpSs-OpenCL Tasks on Heterogeneous CPU-GPU Environment. ISC","author":"Elangovan","year":"2014","unstructured":"Elangovan , et al. Scalability and Parallel Execution of OmpSs-OpenCL Tasks on Heterogeneous CPU-GPU Environment. ISC 2014 . Elangovan, et al. Scalability and Parallel Execution of OmpSs-OpenCL Tasks on Heterogeneous CPU-GPU Environment. ISC 2014."},{"key":"e_1_3_2_1_6_1","unstructured":"Manish. The Architecture and Evolution of CPU-GPU Systems for General Purpose Computing. UCSD.  Manish. The Architecture and Evolution of CPU-GPU Systems for General Purpose Computing. UCSD."},{"key":"e_1_3_2_1_7_1","volume-title":"A static task partitioning approach for heterogeneous systems using OpenCL. Compiler Construction","author":"Grewe","year":"2011","unstructured":"Grewe , et al. A static task partitioning approach for heterogeneous systems using OpenCL. Compiler Construction 2011 . Grewe, et al. A static task partitioning approach for heterogeneous systems using OpenCL. Compiler Construction 2011."},{"key":"e_1_3_2_1_8_1","volume-title":"SAAHPC","author":"Rul Sean","year":"2010","unstructured":"Rul , Sean , et al. An experimental study on performance portability of OpenCL kernels . SAAHPC 2010 . Rul, Sean, et al. An experimental study on performance portability of OpenCL kernels. SAAHPC 2010."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339595"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2400682.2400718"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735702"},{"key":"e_1_3_2_1_12_1","unstructured":"AMD OpenCL SDK Benchmarks. AMD Inc.  AMD OpenCL SDK Benchmarks. AMD Inc."},{"key":"e_1_3_2_1_13_1","unstructured":"CUDA Occupancy Calculator NVidia 2009.  CUDA Occupancy Calculator NVidia 2009."},{"key":"e_1_3_2_1_14_1","volume-title":"Improving Performance Portability in OpenCL Programs. SC","author":"Zhang","year":"2013","unstructured":"Zhang , et al. Improving Performance Portability in OpenCL Programs. SC 2013 . Zhang, et al. Improving Performance Portability in OpenCL Programs. SC 2013."},{"key":"e_1_3_2_1_15_1","volume-title":"GPUs and beyond.","author":"Garg","year":"2013","unstructured":"Garg , Rahul et al. A portable- and high-performance matrix operations library for CPUs , GPUs and beyond. 2013 . Garg, Rahul et al. A portable- and high-performance matrix operations library for CPUs, GPUs and beyond. 2013."},{"key":"e_1_3_2_1_16_1","unstructured":"Lee Joo Hwan et al. OpenCL Performance Evaluation on Modern Multi Core CPUs.  Lee Joo Hwan et al. OpenCL Performance Evaluation on Modern Multi Core CPUs."},{"key":"e_1_3_2_1_17_1","volume-title":"Workshop on automatic performance tuning.","author":"Komatsu Kazuhiko","year":"2010","unstructured":"Komatsu , Kazuhiko , et al. Evaluating performance and portability of OpenCL programs . Workshop on automatic performance tuning. 2010 . Komatsu, Kazuhiko, et al. Evaluating performance and portability of OpenCL programs. Workshop on automatic performance tuning. 2010."}],"event":{"name":"PARMA-DITAM '15: 6th Workshop on Parallel Programming and Run-Time Management Techniques for Many-core Architectures and 4th Workshop on Design Tools and Architectures for Multicore Embedded Computing Platforms","acronym":"PARMA-DITAM '15","location":"Amsterdam Netherlands"},"container-title":["Proceedings of the 6th Workshop on Parallel Programming and Run-Time Management Techniques for Many-core Architectures"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2701310.2701316","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2701310.2701316","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T06:13:09Z","timestamp":1750227189000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2701310.2701316"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,1,19]]},"references-count":17,"alternative-id":["10.1145\/2701310.2701316","10.1145\/2701310"],"URL":"https:\/\/doi.org\/10.1145\/2701310.2701316","relation":{},"subject":[],"published":{"date-parts":[[2015,1,19]]},"assertion":[{"value":"2015-01-19","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}