{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,7]],"date-time":"2024-09-07T05:05:07Z","timestamp":1725685507135},"publisher-location":"Berlin, Heidelberg","reference-count":26,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783642309601"},{"type":"electronic","value":"9783642309618"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2012]]},"DOI":"10.1007\/978-3-642-30961-8_9","type":"book-chapter","created":{"date-parts":[[2012,5,22]],"date-time":"2012-05-22T17:44:50Z","timestamp":1337708690000},"page":"116-129","source":"Crossref","is-referenced-by-count":4,"title":["A Compiler-Assisted Runtime-Prefetching Scheme for Heterogeneous Platforms"],"prefix":"10.1007","author":[{"given":"Li","family":"Chen","sequence":"first","affiliation":[]},{"given":"Baojiang","family":"Shou","sequence":"additional","affiliation":[]},{"given":"Xionghui","family":"Hou","sequence":"additional","affiliation":[]},{"given":"Lei","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"9_CR1","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1002\/cpe.1631","volume":"23","author":"C. Augonnet","year":"2011","unstructured":"Augonnet, C., Thibault, S., Namyst, R., Wacrenier, P.A.: Starpu: a unified platform for task scheduling on heterogeneous multicore architectures. Concurr. Comput.: Pract. Exper.\u00a023, 187\u2013198 (2011)","journal-title":"Concurr. Comput. : Pract. Exper."},{"key":"9_CR2","doi-asserted-by":"crossref","unstructured":"Barrachina, S., Castillo, M., Igual, F., Mayo, R., Quintana-Orti, E.: Evaluation and tuning of the level 3 cublas for graphics processors. In: IEEE International Symposium on Parallel and Distributed Processing, IPDPS 2008, pp. 1\u20138 (April 2008)","DOI":"10.1109\/IPDPS.2008.4536485"},{"key":"9_CR3","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1145\/1810479.1810498","volume-title":"Proceedings of the 22nd ACM Symposium on Parallelism in Algorithms and Architectures, SPAA 2010","author":"M. Becchi","year":"2010","unstructured":"Becchi, M., Byna, S., Cadambi, S., Chakradhar, S.: Data-aware scheduling of legacy kernels on heterogeneous platforms with distributed memory. In: Proceedings of the 22nd ACM Symposium on Parallelism in Algorithms and Architectures, SPAA 2010, pp. 82\u201391. ACM, New York (2010), \n                      \n                        http:\/\/doi.acm.org\/10.1145\/1810479.1810498"},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Che, S., Sheaffer, J.W., Skadron, K.: Dymaxion: optimizing memory access patterns for heterogeneous systems. In: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2011, pp. 13:1\u201313:11. ACM, New York (2011)","DOI":"10.1145\/2063384.2063401"},{"key":"9_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"286","DOI":"10.1007\/978-3-642-19861-8_16","volume-title":"Compiler Construction","author":"D. Grewe","year":"2011","unstructured":"Grewe, D., O\u2019Boyle, M.F.P.: A Static Task Partitioning Approach for Heterogeneous Systems Using OpenCL. In: Knoop, J. (ed.) CC 2011. LNCS, vol.\u00a06601, pp. 286\u2013305. Springer, Heidelberg (2011)"},{"key":"9_CR6","doi-asserted-by":"publisher","first-page":"299","DOI":"10.1145\/1375527.1375571","volume-title":"Proceedings of the 22nd Annual International Conference on Supercomputing, ICS 2008","author":"I. Gelado","year":"2008","unstructured":"Gelado, I., Kelm, J.H., Ryoo, S., Lumetta, S.S., Navarro, N., Hwu, W.M.W.: Cuba: an architecture for efficient cpu\/co-processor data communication. In: Proceedings of the 22nd Annual International Conference on Supercomputing, ICS 2008, pp. 299\u2013308. ACM, New York (2008)"},{"key":"9_CR7","unstructured":"Group, K.O.W.: The opencl specification (2011), \n                      \n                        http:\/\/www.khronos.org\/registry\/cl\/"},{"key":"9_CR8","doi-asserted-by":"publisher","first-page":"52","DOI":"10.1145\/1513895.1513902","volume-title":"GPGPU-2: Proceedings of 2nd Workshop on General Purpose Processing on Graphics Processing Units","author":"T.D. Han","year":"2009","unstructured":"Han, T.D., Abdelrahman, T.S.: \/hi\/cuda: a high-level directive-based language for gpu programming. In: GPGPU-2: Proceedings of 2nd Workshop on General Purpose Processing on Graphics Processing Units, pp. 52\u201361. ACM, New York (2009)"},{"key":"9_CR9","first-page":"142","volume-title":"Proceedings of the 32nd ACM SIGPLAN Conference on Programming Language Design and Implementation, PLDI 2011","author":"T.B. Jablin","year":"2011","unstructured":"Jablin, T.B., Prabhu, P., Jablin, J.A., Johnson, N.P., Beard, S.R., August, D.I.: Automatic cpu-gpu communication management and optimization. In: Proceedings of the 32nd ACM SIGPLAN Conference on Programming Language Design and Implementation, PLDI 2011, pp. 142\u2013151. ACM, New York (2011)"},{"key":"9_CR10","first-page":"277","volume-title":"Proceedings of the 16th ACM Symposium on Principles and Practice of Parallel Programming, PPoPP 2011","author":"J. Kim","year":"2011","unstructured":"Kim, J., Kim, H., Lee, J.H., Lee, J.: Achieving a single compute device image in opencl for multiple gpus. In: Proceedings of the 16th ACM Symposium on Principles and Practice of Parallel Programming, PPoPP 2011, pp. 277\u2013288. ACM, New York (2011)"},{"key":"9_CR11","doi-asserted-by":"publisher","first-page":"451","DOI":"10.1145\/1815961.1816021","volume-title":"Proceedings of the 37th Annual International Symposium on Computer Architecture, ISCA 2010","author":"V.W. Lee","year":"2010","unstructured":"Lee, V.W., Kim, C., Chhugani, J., Deisher, M., Kim, D., Nguyen, A.D., Satish, N., Smelyanskiy, M., Chennupaty, S., Hammarlund, P., Singhal, R., Dubey, P.: Debunking the 100x gpu vs. cpu myth: an evaluation of throughput computing on cpu and gpu. In: Proceedings of the 37th Annual International Symposium on Computer Architecture, ISCA 2010, pp. 451\u2013460. ACM, New York (2010)"},{"key":"9_CR12","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1145\/1346281.1346318","volume-title":"Proceedings of the 13th International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS XIII","author":"M.D. Linderman","year":"2008","unstructured":"Linderman, M.D., Collins, J.D., Wang, H., Meng, T.H.: Merge: a programming model for heterogeneous multi-core systems. In: Proceedings of the 13th International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS XIII, pp. 287\u2013296. ACM, New York (2008)"},{"key":"9_CR13","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1145\/1669112.1669121","volume-title":"Proceedings of the 42nd Annual IEEE\/ACM International Symposium on Microarchitecture, MICRO 42","author":"C.K. Luk","year":"2009","unstructured":"Luk, C.K., Hong, S., Kim, H.: Qilin: exploiting parallelism on heterogeneous multiprocessors with adaptive mapping. In: Proceedings of the 42nd Annual IEEE\/ACM International Symposium on Microarchitecture, MICRO 42, pp. 45\u201355. ACM, New York (2009)"},{"key":"9_CR14","doi-asserted-by":"publisher","first-page":"256","DOI":"10.1145\/1542275.1542313","volume-title":"Proceedings of the 23rd International Conference on Supercomputing, ICS 2009","author":"J. Meng","year":"2009","unstructured":"Meng, J., Skadron, K.: Performance modeling and automatic ghost zone optimization for iterative stencil loops on gpus. In: Proceedings of the 23rd International Conference on Supercomputing, ICS 2009, pp. 256\u2013265. ACM, New York (2009)"},{"key":"9_CR15","unstructured":"Org., O.S.: The openacc application programming interface (2011), \n                      \n                        http:\/\/www.openacc-standard.org\/Downloads\/OpenACC.1.0.pdf?attredirects=0&d=1"},{"key":"9_CR16","doi-asserted-by":"publisher","first-page":"284","DOI":"10.1177\/1094342009106195","volume":"23","author":"J. Planas","year":"2009","unstructured":"Planas, J., Badia, R.M., Ayguad\u00e9, E., Labarta, J.: Hierarchical task-based programming with starss. Int. J. High Perform. Comput. Appl.\u00a023, 284\u2013299 (2009), \n                      \n                        http:\/\/dl.acm.org\/citation.cfm?id=1572226.1572233","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"9_CR17","first-page":"152","volume-title":"Proceedings of the 32nd ACM SIGPLAN Conference on Programming Language Design and Implementation, PLDI 2011","author":"A. Prasad","year":"2011","unstructured":"Prasad, A., Anantpur, J., Govindarajan, R.: Automatic compilation of matlab programs for synergistic execution on heterogeneous processors. In: Proceedings of the 32nd ACM SIGPLAN Conference on Programming Language Design and Implementation, PLDI 2011, pp. 152\u2013163. ACM, New York (2011), \n                      \n                        http:\/\/doi.acm.org\/10.1145\/1993498.1993517"},{"key":"9_CR18","unstructured":"Strengert, M., M\u00fcller, C., Dachsbacher, C., Ertl, T.: Cudasa: Compute unified device and systems architecture. In: Favre, J.M., Ma, K.L. (eds.) EGPGV, pp. 49\u201356. Eurographics Association (2008)"},{"key":"9_CR19","doi-asserted-by":"publisher","first-page":"513","DOI":"10.1145\/1854273.1854336","volume-title":"Proceedings of the 19th International Conference on Parallel Architectures and Compilation Techniques, PACT 2010","author":"I.J. Sung","year":"2010","unstructured":"Sung, I.J., Stratton, J.A., Hwu, W.M.W.: Data layout transformation exploiting memory-level parallelism in structured grid many-core applications. In: Proceedings of the 19th International Conference on Parallel Architectures and Compilation Techniques, PACT 2010, pp. 513\u2013522. ACM, New York (2010)"},{"key":"9_CR20","unstructured":"The Portland Group: PGI Fortran & C Accelator Programming Model. White Paper (2010)"},{"key":"9_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/978-3-540-89740-8_1","volume-title":"Languages and Compilers for Parallel Computing","author":"S.-Z. Ueng","year":"2008","unstructured":"Ueng, S.-Z., Lathara, M., Baghsorkhi, S.S., Hwu, W.-m.W.: CUDA-Lite: Reducing GPU Programming Complexity. In: Amaral, J.N. (ed.) LCPC 2008. LNCS, vol.\u00a05335, pp. 1\u201315. Springer, Heidelberg (2008)"},{"key":"9_CR22","doi-asserted-by":"publisher","first-page":"244","DOI":"10.1145\/1542275.1542312","volume-title":"Proceedings of the 23rd International Conference on Supercomputing, ICS 2009","author":"S. Venkatasubramanian","year":"2009","unstructured":"Venkatasubramanian, S., Vuduc, R.W., none, n.: Tuned and wildly asynchronous stencil kernels for hybrid cpu\/gpu systems. In: Proceedings of the 23rd International Conference on Supercomputing, ICS 2009, pp. 244\u2013255. ACM, New York (2009)"},{"key":"9_CR23","doi-asserted-by":"publisher","first-page":"167","DOI":"10.1145\/1572769.1572796","volume-title":"Proceedings of the Conference on High Performance Graphics 2009, HPG 2009","author":"V. Vineet","year":"2009","unstructured":"Vineet, V., Harish, P., Patidar, S., Narayanan, P.J.: Fast minimum spanning tree for large graphs on the gpu. In: Proceedings of the Conference on High Performance Graphics 2009, HPG 2009, pp. 167\u2013171. ACM, New York (2009)"},{"key":"9_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1007\/978-3-642-21487-5_8","volume-title":"OpenMP in the Petascale Era","author":"L. White","year":"2011","unstructured":"White, L.: OpenMP Extensions for Heterogeneous Architectures. In: Chapman, B.M., Gropp, W.D., Kumaran, K., M\u00fcller, M.S. (eds.) IWOMP 2011. LNCS, vol.\u00a06665, pp. 94\u2013107. Springer, Heidelberg (2011), \n                      \n                        http:\/\/dl.acm.org\/citation.cfm?id=2023025.2023036"},{"key":"9_CR25","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1109\/CLUSTER.2010.12","volume-title":"Proceedings of the 2010 IEEE International Conference on Cluster Computing, CLUSTER 2010","author":"C. Yang","year":"2010","unstructured":"Yang, C., Wang, F., Du, Y., Chen, J., Liu, J., Yi, H., Lu, K.: Adaptive optimization for petascale heterogeneous cpu\/gpu computing. In: Proceedings of the 2010 IEEE International Conference on Cluster Computing, CLUSTER 2010, pp. 19\u201328. IEEE Computer Society, Washington, DC (2010)"},{"key":"9_CR26","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1145\/1806596.1806606","volume-title":"Proceedings of the 2010 ACM SIGPLAN Conference on Programming Language Design and Implementation, PLDI 2010","author":"Y. Yang","year":"2010","unstructured":"Yang, Y., Xiang, P., Kong, J., Zhou, H.: A gpgpu compiler for memory optimization and parallelism management. In: Proceedings of the 2010 ACM SIGPLAN Conference on Programming Language Design and Implementation, PLDI 2010, pp. 86\u201397. ACM, New York (2010)"}],"container-title":["Lecture Notes in Computer Science","OpenMP in a Heterogeneous World"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-30961-8_9.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,5,4]],"date-time":"2021-05-04T11:36:44Z","timestamp":1620128204000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-30961-8_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012]]},"ISBN":["9783642309601","9783642309618"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-30961-8_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2012]]}}}