{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:12:20Z","timestamp":1750219940343,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,2,25]],"date-time":"2023-02-25T00:00:00Z","timestamp":1677283200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,2,25]]},"DOI":"10.1145\/3589236.3589241","type":"proceedings-article","created":{"date-parts":[[2023,6,21]],"date-time":"2023-06-21T00:32:47Z","timestamp":1687307567000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["GPU Auto-tuning Framework for Optimal Performance and Power Consumption"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1437-479X","authenticated-orcid":false,"given":"Sunbal","family":"Cheema","sequence":"first","affiliation":[{"name":"Electrical, Computer and Biomedical Engineering, Toronto Metropolitan (formerly Ryerson) University, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3169-5952","authenticated-orcid":false,"given":"Gul","family":"Khan","sequence":"additional","affiliation":[{"name":"Electrical, Computer and Biomedical Engineering, Toronto Metropolitan (formerly Ryerson) University, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1038\/d41586-018-06610-y"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2013.05.299"},{"key":"e_1_3_2_1_3_1","first-page":"1","volume-title":"Int. Conf. on High Performance Computing, Networking, Storage and Analysis","author":"H.","year":"2012","unstructured":"H. Jordan\u00a0et al. 2012. A multi-objective auto-tuning framework for parallel codes. Int. Conf. on High Performance Computing, Networking, Storage and Analysis, pp. 1-12."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628087"},{"key":"e_1_3_2_1_5_1","first-page":"1231","volume-title":"IEEE International Parallel and Distributed Processing Symposium Workshop","author":"Falch T. L.","unstructured":"T. L. Falch and A. C. Elster. 2015. Machine Learning based Auto-Tuning for Enhanced OpenCL Performance Portability. IEEE International Parallel and Distributed Processing Symposium Workshop, pp. 1231-1240."},{"key":"e_1_3_2_1_6_1","first-page":"162","volume-title":"Int. Conf. on Parallel Processing","author":"Fang J.","year":"2014","unstructured":"J. Fang, H. Sips, P. Jaaskelainen, and A. L. Varbanescu. 2014. Grover: Looking for Performance Improvement by Disabling Local Memory Usage in OpenCL Kernels. Int. Conf. on Parallel Processing, 2014, pp. 162-171."},{"key":"e_1_3_2_1_7_1","first-page":"508","volume-title":"3rd Int. Symp. on Computing and Networking","author":"Hirasawa S.","unstructured":"S. Hirasawa, H. Takizawa, and H. Kobayashi. 2015. A Verification Framework for Streamlining Empirical Auto-Tuning. 3rd Int. Symp. on Computing and Networking, pp. 508-514."},{"key":"e_1_3_2_1_8_1","first-page":"736","volume-title":"Automation and Test in Europe Conference & Exhibition (DATE)","author":"Paone E.","year":"2015","unstructured":"E. Paone 2015. Customization of OpenCL Applications for Efficient Task Mapping under Heterogeneous Platform Constraints. Design, Automation and Test in Europe Conference & Exhibition (DATE), pp. 736-741."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysarc.2015.11.012"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628092"},{"key":"e_1_3_2_1_11_1","first-page":"195","volume-title":"CLTune: A Generic Auto-Tuner for OpenCL Kernels. IEEE 9th International Symposium on Embedded Multicore\/Many-core Systems-on-Chip, Turin Italy","author":"Nugteren C.","unstructured":"C. Nugteren and V. Codreanu. 2015. CLTune: A Generic Auto-Tuner for OpenCL Kernels. IEEE 9th International Symposium on Embedded Multicore\/Many-core Systems-on-Chip, Turin Italy, pp. 195-202."},{"key":"e_1_3_2_1_12_1","first-page":"64","volume-title":"ATF: A Generic Auto-Tuning Framework. IEEE 19th International Conference on High Performance Computing and Communication","author":"Rasch A.","unstructured":"A. Rasch, M. Haidl, and S. Gorlatch. 2017. ATF: A Generic Auto-Tuning Framework. IEEE 19th International Conference on High Performance Computing and Communication, pp. 64-71."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2020.02.069"},{"key":"e_1_3_2_1_14_1","first-page":"1","volume-title":"2nd International Conference on Green High Performance Computing (ICGHPC)","author":"Mijakovi\u0107 R.","unstructured":"R. Mijakovi\u0107, M. Firbach, and M. Gerndt. 2016. An architecture for flexible auto-tuning: The Periscope Tuning Framework 2.0. 2nd International Conference on Green High Performance Computing (ICGHPC), pp. 1-9."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2962131"},{"key":"e_1_3_2_1_16_1","first-page":"173","volume-title":"Springer, Berlin Heidelberg","author":"Terpstra D.","unstructured":"D. Terpstra, H. Jagode, H. You and J. Dongarra. 2009. Collecting performance data with PAPI-C. In Tools for High Performance Computing, M. M\u00fcller, M. Resch, A. Schulz and W. Nagel, Eds., Springer, Berlin Heidelberg, pp. 157\u2013173."},{"key":"e_1_3_2_1_17_1","first-page":"122","volume-title":"Statistical Power Modeling of GPU Kernels Using Performance Counters. IEEE Int. Conf. on Green Computing","author":"Nagasaka H.","year":"2010","unstructured":"H. Nagasaka 2010. Statistical Power Modeling of GPU Kernels Using Performance Counters. IEEE Int. Conf. on Green Computing, pp. 115\u2013122."},{"key":"e_1_3_2_1_18_1","first-page":"686","volume-title":"Int. Symp. on Parallel and Distributed Processing","author":"Song S.","unstructured":"S. Song, C. Su, B. Rountree, and K. W. Cameron. 2013. A Simplified and Accurate Model of Power-Performance Efficiency on Emergent GPU Architectures. Int. Symp. on Parallel and Distributed Processing, pp. 673\u2013686"},{"key":"e_1_3_2_1_19_1","unstructured":"NVIDIA. NVML API Reference Manual. Available online: https:\/\/developer.nvidia.com\/nvidia-management-library-nvml"},{"key":"e_1_3_2_1_20_1","volume-title":"User's Guide","author":"CUPTI","year":"2013","unstructured":"CUPTI: User's Guide, NVIDIA Corporation, July 2013. Available online: https:\/\/docs.nvidia.com\/cuda\/cupti\/index.html"},{"key":"e_1_3_2_1_21_1","first-page":"254","volume-title":"Online Power Estimation of Graphics Processing Units. 16th IEEE\/ACM Int. Symp. on Cluster, Cloud and Grid Computing","author":"Adhinarayanan V.","year":"2016","unstructured":"V. Adhinarayanan, B. Subramaniam, and W. chun Feng. 2016. Online Power Estimation of Graphics Processing Units. 16th IEEE\/ACM Int. Symp. on Cluster, Cloud and Grid Computing, Cartagena, Columbia, pp. 245\u2013254."},{"key":"e_1_3_2_1_22_1","first-page":"28","volume-title":"ACM Workshop on General Purpose Processing Using GPUs","author":"Burtscher M.","unstructured":"M. Burtscher, I. Zecena, and Z. Zong. 2014. Measuring GPU power with the K20 built-in sensor. ACM Workshop on General Purpose Processing Using GPUs, pp. 28-36."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/4235.996017"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"A. Blot M. Kessaci and L. Jourdan. 2018. Survey and unification of local search techniques in metaheuristics for multi-objective combinatorial optimisation.\u00a0Journal of Heuristics \u00a0vol. 24 \u00a0no. 6 \u00a0pp. 853-877.","DOI":"10.1007\/s10732-018-9381-1"},{"key":"e_1_3_2_1_25_1","first-page":"1698","volume-title":"Analysis of GPU Power Consumption Using Internal Sensors. 16th Workshop em Desempenho de Sistemas Computacionais e de Comunica\u00e7\u00e3o","author":"Ferro M.","year":"2017","unstructured":"M. Ferro, A. Yokoyama, V. Kl\u00f4h, G. Silva, R. Gandra, R. Bragan\u00e7a \u0327 A. Bulc\u00e3o, B. Schulze. 2017. Analysis of GPU Power Consumption Using Internal Sensors. 16th Workshop em Desempenho de Sistemas Computacionais e de Comunica\u00e7\u00e3o, pp. 1698-1711."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1155\/2014\/657302"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.5555\/2747903.2748182"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/358923.358929"}],"event":{"name":"GPGPU '23: 15th Workshop on General Purpose Processing Using GPU","acronym":"GPGPU '23","location":"Montreal Canada"},"container-title":["Proceedings of the 15th Workshop on General Purpose Processing Using GPU"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3589236.3589241","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3589236.3589241","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:48:53Z","timestamp":1750182533000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3589236.3589241"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2,25]]},"references-count":28,"alternative-id":["10.1145\/3589236.3589241","10.1145\/3589236"],"URL":"https:\/\/doi.org\/10.1145\/3589236.3589241","relation":{},"subject":[],"published":{"date-parts":[[2023,2,25]]},"assertion":[{"value":"2023-06-20","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}