{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,3]],"date-time":"2025-11-03T13:28:42Z","timestamp":1762176522967},"reference-count":12,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2011,7]]},"DOI":"10.1109\/hpcsim.2011.5999886","type":"proceedings-article","created":{"date-parts":[[2011,8,31]],"date-time":"2011-08-31T19:18:55Z","timestamp":1314818335000},"page":"631-639","source":"Crossref","is-referenced-by-count":29,"title":["Understanding the impact of CUDA tuning techniques for Fermi"],"prefix":"10.1109","author":[{"given":"Yuri","family":"Torres","sequence":"first","affiliation":[]},{"given":"Arturo","family":"Gonzalez-Escribano","sequence":"additional","affiliation":[]},{"given":"Diego R.","family":"Llanos","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"journal-title":"Programming Massively Parallel Processors A Hands-on Approach","year":"2010","author":"kirk","key":"3"},{"key":"2","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735696"},{"key":"10","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735697"},{"journal-title":"Nvidia Optimizing Matrix Transpose in CUDA","year":"2010","key":"1"},{"journal-title":"Fermi Architecture Home Page","year":"2010","key":"7"},{"journal-title":"Tuning CUDA Applications for Fermi","year":"2010","key":"6"},{"journal-title":"NVIDIA CUDA ProgrammingGuide 3 0 Fermi","year":"2010","key":"5"},{"key":"4","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735698"},{"journal-title":"Modeling Execution and Predicting Performance in Multi-GPU Environments in Electrical and Computer Engineering","year":"2009","author":"schaa","key":"9"},{"key":"8","doi-asserted-by":"publisher","DOI":"10.1145\/1345206.1345220"},{"key":"11","first-page":"237","article-title":"Auto-tuning dense matrix multiplication for GPGPU with cache","author":"zhang","year":"2010","journal-title":"Proc ICPADS'2010"},{"key":"12","first-page":"343","article-title":"An optimizing compiler for GPGPU programs with input-data sharing","author":"yi","year":"2010","journal-title":"Proceedings of PPoPP"}],"event":{"name":"Simulation (HPCS)","start":{"date-parts":[[2011,7,4]]},"location":"Istanbul, Turkey","end":{"date-parts":[[2011,7,8]]}},"container-title":["2011 International Conference on High Performance Computing &amp; Simulation"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx5\/5979720\/5999789\/05999886.pdf?arnumber=5999886","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2017,3,21]],"date-time":"2017-03-21T10:17:31Z","timestamp":1490091451000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/5999886\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2011,7]]},"references-count":12,"URL":"https:\/\/doi.org\/10.1109\/hpcsim.2011.5999886","relation":{},"subject":[],"published":{"date-parts":[[2011,7]]}}}