{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T02:11:26Z","timestamp":1775873486792,"version":"3.50.1"},"publisher-location":"Berlin, Heidelberg","reference-count":27,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"value":"9783642286513","type":"print"},{"value":"9783642286520","type":"electronic"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2012]]},"DOI":"10.1007\/978-3-642-28652-0_1","type":"book-chapter","created":{"date-parts":[[2012,3,22]],"date-time":"2012-03-22T21:02:02Z","timestamp":1332450122000},"page":"1-20","source":"Crossref","is-referenced-by-count":47,"title":["Improving Performance of OpenCL on CPUs"],"prefix":"10.1007","author":[{"given":"Ralf","family":"Karrenberg","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sebastian","family":"Hack","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Allen, J.R., Kennedy, K., Porterfield, C., Warren, J.: Conversion of control dependence to data dependence. In: POPL, pp. 177\u2013189. ACM (1983)","DOI":"10.1145\/567067.567085"},{"issue":"4","key":"1_CR2","doi-asserted-by":"publisher","first-page":"491","DOI":"10.1145\/29873.29875","volume":"9","author":"R. Allen","year":"1987","unstructured":"Allen, R., Kennedy, K.: Automatic translation of FORTRAN programs to vector form. ACM Trans. Program. Lang. Syst.\u00a09(4), 491\u2013542 (1987)","journal-title":"ACM Trans. Program. Lang. Syst."},{"key":"1_CR3","unstructured":"AMD: AMD APP SDK v2.5 (March 2011)"},{"issue":"4","key":"1_CR4","doi-asserted-by":"publisher","first-page":"44","DOI":"10.1109\/38.56298","volume":"10","author":"A. Apodaca","year":"1990","unstructured":"Apodaca, A., Mantle, M.: RenderMan: Pursuing the Future of Graphics. IEEE Computer Graphics & Applications\u00a010(4), 44\u201349 (1990)","journal-title":"IEEE Computer Graphics & Applications"},{"key":"1_CR5","unstructured":"Cheong, G., Lam, M.: An Optimizer for Multimedia Instruction Sets. In: Second SUIF Compiler Workshop (1997)"},{"key":"1_CR6","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4612-1362-8","volume-title":"Scheduling and Automatic Parallelization","author":"A. Darte","year":"2000","unstructured":"Darte, A., Robert, Y., Vivien, F.: Scheduling and Automatic Parallelization. Birkhauser, Boston (2000)"},{"key":"1_CR7","unstructured":"Fritz, N., Lucas, P., Slusallek, P.: CGiS, a New Language for Data-Parallel GPU Programming. In: VMV, pp. 241\u2013248 (2004)"},{"key":"1_CR8","doi-asserted-by":"publisher","first-page":"205","DOI":"10.1145\/1854273.1854302","volume-title":"PACT","author":"J. Gummaraju","year":"2010","unstructured":"Gummaraju, J., Morichetti, L., Houston, M., Sander, B., Gaster, B.R., Zheng, B.: Twin peaks: a software platform for heterogeneous computing on general-purpose and graphics processors. In: PACT, pp. 205\u2013216. ACM, New York (2010)"},{"key":"1_CR9","first-page":"285","volume-title":"ASPLOS","author":"A.H. Hormati","year":"2010","unstructured":"Hormati, A.H., Choi, Y., Woh, M., Kudlur, M., Rabbah, R., Mudge, T., Mahlke, S.: Macross: macro-simdization of streaming applications. In: ASPLOS, pp. 285\u2013296. ACM, New York (2010)"},{"key":"1_CR10","unstructured":"Intel: Intel OpenCL SDK 1.1 (June 2011)"},{"key":"1_CR11","doi-asserted-by":"crossref","unstructured":"Jaskelainen, P.O., de La Lama, C.S., Huerta, P., Takala, J.: OpenCL-based design methodology for application-specific processors. In: SAMOS 2010, pp. 223\u2013230 (July 2010)","DOI":"10.1109\/ICSAMOS.2010.5642061"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Karrenberg, R., Hack, S.: Whole Function Vectorization. In: CGO, pp. 141\u2013150 (2011)","DOI":"10.1109\/CGO.2011.5764682"},{"key":"1_CR13","unstructured":"Khronos Group: OpenCL 1.1 Specification (June 2011)"},{"key":"1_CR14","unstructured":"Lattner, C., Adve, V.: LLVM: A Compilation Framework for Lifelong Program Analysis & Transformation. In: CGO (March 2004)"},{"key":"1_CR15","doi-asserted-by":"crossref","unstructured":"Newburn, C.J., So, B., Liu, Z., McCool, M.D., Ghuloum, A.M., Toit, S.D., Wang, Z.G., Du, Z., Chen, Y., Wu, G., Guo, P., Liu, Z., Zhang, D.: Intel\u2019s Array Building Blocks: A retargetable, dynamic compiler and embedded language. In: CGO, pp. 224\u2013235 (2011)","DOI":"10.1109\/CGO.2011.5764690"},{"key":"1_CR16","unstructured":"Ngo, V.: Parallel loop transformation techniques for vector-based multiprocessor systems. Ph.D. thesis, University of Minnesota-Twin Cities (May 1994)"},{"key":"1_CR17","doi-asserted-by":"crossref","unstructured":"Nuzman, D., Henderson, R.: Multi-platform auto-vectorization. In: CGO, pp. 281\u2013294 (2006)","DOI":"10.1109\/CGO.2006.25"},{"key":"1_CR18","doi-asserted-by":"crossref","unstructured":"Nuzman, D., Zaks, A.: Outer-loop vectorization: revisited for short simd architectures. In: PACT, pp. 2\u201311. ACM (2008)","DOI":"10.1145\/1454115.1454119"},{"key":"1_CR19","unstructured":"NVIDIA: CUDA Programming Guide (2009)"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Parker, S., et al.: RTSL: A Ray Tracing Shading Language. In: IEEE Symposium on Interactive Ray Tracing (2007)","DOI":"10.1109\/RT.2007.4342603"},{"key":"1_CR21","unstructured":"Pharr, M.: Intel SPMD Program Compiler (June 2011)"},{"key":"1_CR22","doi-asserted-by":"crossref","unstructured":"Shin, J.: Introducing Control Flow into Vectorized Code. In: PACT, pp. 280\u2013291. IEEE Computer Society (2007)","DOI":"10.1109\/PACT.2007.4336219"},{"issue":"4","key":"1_CR23","doi-asserted-by":"publisher","first-page":"363","DOI":"10.1023\/A:1007559022013","volume":"28","author":"N. Sreraman","year":"2000","unstructured":"Sreraman, N., Govindarajan, R.: A vectorizing compiler for multimedia extensions. Int. J. Parallel Program.\u00a028(4), 363\u2013400 (2000)","journal-title":"Int. J. Parallel Program."},{"key":"1_CR24","unstructured":"Steckelmacher, D.: An OpenCL State Tracker for Gallium based on Clover (August 2011), http:\/\/people.freedesktop.org\/~steckdenis\/clover"},{"key":"1_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"16","DOI":"10.1007\/978-3-540-89740-8_2","volume-title":"Languages and Compilers for Parallel Computing","author":"J.A. Stratton","year":"2008","unstructured":"Stratton, J.A., Stone, S.S., Hwu, W.-m.W.: MCUDA: An Efficient Implementation of CUDA Kernels for Multi-core CPUs. In: Amaral, J.N. (ed.) LCPC 2008. LNCS, vol.\u00a05335, pp. 16\u201330. Springer, Heidelberg (2008)"},{"key":"1_CR26","unstructured":"The Portland Group, Inc.: PGI CUDA-x86 (June 2011)"},{"key":"1_CR27","unstructured":"Touati, S.A.A., Worms, J., Briais, S.: The Speedup Test. Rapport de recherche (2010), http:\/\/hal.inria.fr\/inria-00443839\/en\/"}],"container-title":["Lecture Notes in Computer Science","Compiler Construction"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-28652-0_1.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,23]],"date-time":"2025-03-23T18:55:25Z","timestamp":1742756125000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-28652-0_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012]]},"ISBN":["9783642286513","9783642286520"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-28652-0_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2012]]}}}