{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,6,1]],"date-time":"2024-06-01T05:14:11Z","timestamp":1717218851261},"reference-count":27,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2014,12,13]],"date-time":"2014-12-13T00:00:00Z","timestamp":1418428800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2015,8]]},"DOI":"10.1007\/s11227-014-1338-z","type":"journal-article","created":{"date-parts":[[2014,12,12]],"date-time":"2014-12-12T06:58:46Z","timestamp":1418367526000},"page":"2900-2921","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["A statistical performance analyzer framework for OpenCL kernels on Nvidia GPUs"],"prefix":"10.1007","volume":"71","author":[{"given":"Ali","family":"Karami","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Farshad","family":"Khunjush","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Seyyed Ali","family":"Mirsoleimani","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2014,12,13]]},"reference":[{"key":"1338_CR1","doi-asserted-by":"crossref","unstructured":"Baghsorkhi SS, Delahaye M, Patel SJ, Gropp WD, Hwu WW (2010) An adaptive performance modeling tool for GPU architectures. In: ACM SIGPLAN notices, vol 45, pp 105\u2013114. ACM, New York","DOI":"10.1145\/1837853.1693470"},{"key":"1338_CR2","doi-asserted-by":"crossref","unstructured":"Bakhoda A, Yuan GL, Fung WWL, Wong H, Aamodt TM (2009) Analyzing CUDA workloads using a detailed GPU simulator. In: 2009 IEEE international symposium on performance analysis of systems and software, pp 163\u2013174","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"1338_CR3","doi-asserted-by":"crossref","unstructured":"Che S, Boyer M, Meng J, Tarjan D, Sheaffer JW, Lee S-H, Skadron K (2009) Rodinia: a benchmark suite for heterogeneous computing. In: 2009 IEEE international symposium on workload characterization (IISWC), vol 2009, pp 44\u201354","DOI":"10.1109\/IISWC.2009.5306797"},{"issue":"8","key":"1338_CR4","doi-asserted-by":"crossref","first-page":"391","DOI":"10.1016\/j.parco.2011.10.002","volume":"38","author":"Peng Du","year":"2012","unstructured":"Du Peng, Weber R, Luszczek P, Tomov S, Peterson G, Dongarra J (2012) From CUDA to OpenCL: towards a performance-portable solution for multi-platform GPU programming. Parallel Comput 38(8):391\u2013407","journal-title":"Parallel Comput"},{"key":"1338_CR5","doi-asserted-by":"crossref","unstructured":"Goswami N, Shankar R, Joshi M, Li T (2010) Exploring GPGPU workloads: characterization methodology, analysis and microarchitecture evaluation implications. In: Proceedings of the IEEE international symposium on workload characterization (IISWC\u201910), pp 1\u201310, Washington, DC","DOI":"10.1109\/IISWC.2010.5649549"},{"key":"1338_CR6","doi-asserted-by":"crossref","unstructured":"Jia W, Shaw KA, Martonosi M (2012) Stargazer: automated regression-based GPU design space exploration. In: IEEE international symposium on performance analysis of systems and software ISPASS, pp 2\u201313","DOI":"10.1109\/ISPASS.2012.6189201"},{"key":"1338_CR7","doi-asserted-by":"crossref","unstructured":"Joseph PJ, Vaswani K, Thazhuthaveetil MJ (2006) Construction and use of linear regression models for processor performance analysis. In: The 12th international symposium on high-performance computer architecture, pp 99\u2013108","DOI":"10.1109\/HPCA.2006.1598116"},{"key":"1338_CR8","doi-asserted-by":"crossref","unstructured":"Kerr A, Anger E, Hendry G, Yalamanchili S (2012) Eiger: a framework for the automated synthesis of statistical performance models. In: 19th international conference on high performance computing, pp 1\u20136, Los Alamitos. IEEE Computer Society","DOI":"10.1109\/HiPC.2012.6507525"},{"key":"1338_CR9","doi-asserted-by":"crossref","unstructured":"Kerr A, Diamos G, Yalamanchili S (2009) A characterization and analysis of PTX kernels. In: 2009 IEEE international symposium on workload characterization (IISWC), pp 3\u201312","DOI":"10.1109\/IISWC.2009.5306801"},{"key":"1338_CR10","doi-asserted-by":"crossref","unstructured":"Kerr A, Diamos G, Yalamanchili S (2010) Modeling GPU\u2013CPU workloads and systems. In: Proceedings of the 3rd workshop on general-purpose computation on graphics processing units (GPGPU \u201910), pp 31\u201342, New York. ACM Press, New York","DOI":"10.1145\/1735688.1735696"},{"key":"1338_CR11","unstructured":"Kohavi R (1995) A study of cross-validation and bootstrap for accuracy estimation and model selection. In: Proceedings of the 14th international joint conference on artificial intelligence (IJCAI\u201995), vol 2, pp 1137\u20131143, San Francisco. Morgan Kaufmann Publishers Inc., Menlo Park"},{"key":"1338_CR12","unstructured":"Lopez-Novoa U, Mendiburu A, Miguel-Alonso J (2014) A survey of performance modeling and simulation techniques for accelerator-based computing. IEEE Trans Parallel Distrib Syst 9219(c):1\u20131"},{"key":"1338_CR13","doi-asserted-by":"crossref","unstructured":"Manly BFJ (2004) Multivariate statistical methods: a primer, 3rd edn. Chapman and Hall, London","DOI":"10.1201\/b16974"},{"key":"1338_CR14","unstructured":"Montgomery DC, Runger GC (2010) Applied statistics and probability for engineers, 5th edn. Wiley, New York"},{"key":"1338_CR15","unstructured":"Munshi A (2011) The OpenCL specification"},{"key":"1338_CR16","unstructured":"Nguyen H (2007) Gpu gems 3, 1st edn. Addison-Wesley Professional, Menlo Park"},{"key":"1338_CR17","unstructured":"NVIDIA (2011) CUDA tools SDK CUPTI users guide"},{"key":"1338_CR18","unstructured":"NVIDIA (2012) CUDA SDK 4.1"},{"key":"1338_CR19","unstructured":"NVIDIA (2014) CUDA C programming guide"},{"key":"1338_CR20","unstructured":"NVIDIA (2014) NVIDIA visual profiler"},{"key":"1338_CR21","doi-asserted-by":"crossref","unstructured":"Purnomo B, Rubin N, Houston M (2010) ATI stream profiler: a tool to optimize an OpenCL kernel on ATI Radeon GPUs. In: ACM SIGGRAPH 2010 Posters (SIGGRAPH \u201910), New York. ACM, New York","DOI":"10.1145\/1836845.1836904"},{"issue":"3","key":"1338_CR22","doi-asserted-by":"crossref","first-page":"66","DOI":"10.1109\/MCSE.2010.69","volume":"12","author":"JE Stone","year":"2010","unstructured":"Stone JE, Gohara D, Shi G (2010) OpenCL: a parallel programming standard for heterogeneous computing systems. Comput Sci Eng 12(3):66\u201372","journal-title":"Comput Sci Eng"},{"key":"1338_CR23","doi-asserted-by":"crossref","first-page":"50","DOI":"10.1109\/MM.2011.24","volume":"31","author":"CM Wittenbrink","year":"2011","unstructured":"Wittenbrink CM, Kilgariff E, Prabhu A (2011) Fermi GF100 GPU architecture. IEEE Micro 31:50\u201359","journal-title":"IEEE Micro"},{"issue":"13","key":"1338_CR24","doi-asserted-by":"crossref","first-page":"37","DOI":"10.1016\/0169-7439(87)80084-9","volume":"2","author":"S Wold","year":"1987","unstructured":"Wold S, Esbensen K, Geladi P (1987) Principal component analysis. Chemom Intell Lab Syst 2(13):37\u201352","journal-title":"Chemom Intell Lab Syst"},{"key":"1338_CR25","doi-asserted-by":"crossref","unstructured":"Zhang Y, Owens JD (2011) A quantitative performance analysis model for GPU architectures. In: IEEE 17th international symposium on high performance computer architecture, pp 382\u2013393","DOI":"10.1109\/HPCA.2011.5749745"},{"key":"1338_CR26","doi-asserted-by":"crossref","unstructured":"Zhang Y, Hu Y, Li B, Peng L (2011) Performance and power analysis of ATI GPU: a statistical approach. In: 6th IEEE international conference on networking, architecture and storage (NAS), pp 149\u2013158","DOI":"10.1109\/NAS.2011.51"},{"key":"1338_CR27","doi-asserted-by":"crossref","unstructured":"Zhang Y, Peng L, Li B, Peir J-K, Chen J (2011) Architecture comparisons between Nvidia and ATI GPUs: computation parallelism and data communications. In: 2011 IEEE international symposium on workload characterization (IISWC), pp 205\u2013215","DOI":"10.1109\/IISWC.2011.6114180"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-014-1338-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-014-1338-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-014-1338-z","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,8,18]],"date-time":"2019-08-18T11:15:38Z","timestamp":1566126938000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-014-1338-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014,12,13]]},"references-count":27,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2015,8]]}},"alternative-id":["1338"],"URL":"https:\/\/doi.org\/10.1007\/s11227-014-1338-z","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2014,12,13]]}}}