{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T16:11:05Z","timestamp":1781194265483,"version":"3.54.1"},"reference-count":37,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2010,6,30]],"date-time":"2010-06-30T00:00:00Z","timestamp":1277856000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2011,2]]},"DOI":"10.1007\/s10766-010-0142-5","type":"journal-article","created":{"date-parts":[[2010,6,29]],"date-time":"2010-06-29T12:06:29Z","timestamp":1277813189000},"page":"115-142","source":"Crossref","is-referenced-by-count":36,"title":["A Performance Study for Iterative Stencil Loops on GPUs with Ghost Zone Optimizations"],"prefix":"10.1007","volume":"39","author":[{"given":"Jiayuan","family":"Meng","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kevin","family":"Skadron","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2010,6,30]]},"reference":[{"key":"142_CR1","doi-asserted-by":"crossref","unstructured":"Allen, G., Dramlitsch, T., Foster, I., Karonis, N.T., Ripeanu, M., Seidel, E., Toonen, B.: Supporting efficient execution in heterogeneous distributed computing environments with cactus and globus. In: SC\u201901, pp. 52\u201352 (2001)","DOI":"10.1145\/582034.582086"},{"key":"142_CR2","doi-asserted-by":"crossref","unstructured":"Alpert, M.: Not just fun and games. April (1999)","DOI":"10.1038\/scientificamerican0499-40"},{"issue":"6","key":"142_CR3","doi-asserted-by":"crossref","first-page":"145","DOI":"10.1145\/113445.113458","volume":"26","author":"M. Bromley","year":"1991","unstructured":"Bromley M., Heller S., McNerney T., Steele G.L. Jr: Fortran at ten gigaflops: the connection machine convolution compiler. PLDI \u201991 26(6), 145\u2013156 (1991)","journal-title":"PLDI \u201991"},{"key":"142_CR4","doi-asserted-by":"crossref","unstructured":"Chatterjee, S., Gilbert, J.R., Schreiber, R.: Mobile and replicated alignment of arrays in data-parallel programs. In: SC\u201993, pp. 420\u2013429 November (1993)","DOI":"10.1145\/169627.169764"},{"key":"142_CR5","doi-asserted-by":"crossref","unstructured":"Che, S., Boyer, M., Meng, J., Tarjan, D., Sheaffer, J.W., Skadron, K.: A performance study of general purpose applications on graphics processors using CUDA, June (2008)","DOI":"10.1016\/j.jpdc.2008.05.014"},{"key":"142_CR6","unstructured":"NVIDIA Corporation. Geforce gtx 280 specifications. (2008)"},{"key":"142_CR7","unstructured":"NVIDIA Corporation. NVIDIA CUDA visual profiler. June (2008)"},{"key":"142_CR8","doi-asserted-by":"crossref","unstructured":"Dagum, L.: OpenMP: a proposed industry standard API for shared memory programming, October (1997)","DOI":"10.1109\/99.660313"},{"key":"142_CR9","doi-asserted-by":"crossref","unstructured":"Datta, K., Murphy, M., Volkov, V., Williams, S., Carter, J., Oliker, L., Patterson, D., Shalf, J., Yelick, K.: Stencil computation optimization and auto-tuning on state-of-the-art multicore architectures. In: SC \u201908. 1\u201312 (2008)","DOI":"10.1109\/SC.2008.5222004"},{"key":"142_CR10","doi-asserted-by":"crossref","unstructured":"Deitz, S.J., Chamberlain, B.L., Snyder, L.: Eliminating redundancies in sum-of-product array computations. In: ICS \u201901, pp. 65\u201377 (2001)","DOI":"10.1145\/377792.377807"},{"key":"142_CR11","unstructured":"Evans, L.C.: Partial differential equations. Am. Math. Soc. (1998)"},{"key":"142_CR12","unstructured":"Chen, L., Zhang, Z.-Q., Feng, X.-B.: Redundant computation partition on distributed-memory systems. In: ICA3PP \u201902, pp. 252 (2002)"},{"key":"142_CR13","doi-asserted-by":"crossref","unstructured":"Frigo, M., Strumpen, V.: Cache oblivious stencil computations. In: ICS\u201905, pp. 361\u2013366 (2005)","DOI":"10.1145\/1088149.1088197"},{"key":"142_CR14","unstructured":"Goodnight, N.: CUDA\/OpenGL fluid simulation. April (2007)"},{"key":"142_CR15","doi-asserted-by":"crossref","unstructured":"Gschwind, M.: Chip multiprocessing and the cell broadband engine. In: CF\u201906 (2006)","DOI":"10.1145\/1128022.1128023"},{"issue":"2","key":"142_CR16","doi-asserted-by":"crossref","first-page":"90","DOI":"10.1006\/jpdc.1993.1094","volume":"19","author":"C.-H. Huang","year":"1993","unstructured":"Huang C.-H., Sadayappan P.: Communication-free hyperplane partitioning of nested loops. J. Parallel Distrib. Comput. 19(2), 90\u2013102 (1993)","journal-title":"J. Parallel Distrib. Comput."},{"key":"142_CR17","doi-asserted-by":"crossref","unstructured":"Huang, W., Stan, M.R., Skadron, K., Ghosh, S., Sankaranarayanan, K., Velusamy, S.: Compact thermal modeling for temperature-aware design. In: DAC\u201904. (2004)","DOI":"10.1145\/996566.996800"},{"key":"142_CR18","unstructured":"Electronic Educational Devices Inc. Watts up? electricity meter operator\u2019s manual. (2002)"},{"key":"142_CR19","unstructured":"Jalby, W., Meier, U.: Optimizing matrix operations on a parallel multiprocessor with a hierarchical memory system, pp. 429\u2013432 (1986)"},{"key":"142_CR20","doi-asserted-by":"crossref","unstructured":"Kamil, S., Husbands, P., Oliker, L., Shalf, J., Yelick, K.: Impact of modern memory subsystems on cache optimizations for stencil computations. In: MSP\u201905, pp. 36\u201343 (2005)","DOI":"10.1145\/1111583.1111589"},{"issue":"4","key":"142_CR21","doi-asserted-by":"crossref","first-page":"381","DOI":"10.1007\/s006070070032","volume":"64","author":"M. Kowarschik","year":"2000","unstructured":"Kowarschik M., Wei\u00df C., Karl W., R\u00fcde U.: Cache-aware multigrid methods for solving poisson\u2019s equation in two dimensions. Computing 64(4), 381\u2013399 (2000)","journal-title":"Computing"},{"issue":"6","key":"142_CR22","doi-asserted-by":"crossref","first-page":"235","DOI":"10.1145\/1250734.1250761","volume":"42","author":"S. Krishnamoorthy","year":"2007","unstructured":"Krishnamoorthy S., Baskaran M., Bondhugula U., Ramanujam J., Rountev A., Sadayappan P.: Effective automatic parallelization of stencil computations. PLDI \u201907 42(6), 235\u2013244 (2007)","journal-title":"PLDI \u201907"},{"key":"142_CR23","doi-asserted-by":"crossref","first-page":"1895","DOI":"10.1016\/0167-8191(95)00052-6","volume":"21","author":"P. Lee","year":"1995","unstructured":"Lee P.: Techniques for compiling programs on distributed memory multicomputers. Parallel Comput. 21, 1895\u20131923 (1995)","journal-title":"Parallel Comput."},{"issue":"6","key":"142_CR24","doi-asserted-by":"crossref","first-page":"975","DOI":"10.1145\/1034774.1034777","volume":"26","author":"Z. Li","year":"2004","unstructured":"Li Z., Song Y.: Automatic tiling of iterative stencil loops. ACM Trans. Program. Lang. Syst. 26(6), 975\u20131028 (2004)","journal-title":"ACM Trans. Program. Lang. Syst."},{"key":"142_CR25","first-page":"19","volume":"8","author":"N. Manjikian","year":"1997","unstructured":"Manjikian N., Abdelrahman T.S.: Fusion of loops for parallelism and locality. Parallel Distrib. Syst. 8, 19\u201328 (1997)","journal-title":"Parallel Distrib. Syst."},{"key":"142_CR26","doi-asserted-by":"crossref","unstructured":"Meng, J., Skadron, K.: Performance modeling and automatic ghost zone optimization for iterative stencil loops on gpus. In: ICS \u201909, pp. 256\u2013265 (2009)","DOI":"10.1145\/1542275.1542313"},{"issue":"2","key":"142_CR27","doi-asserted-by":"crossref","first-page":"40","DOI":"10.1145\/1365490.1365500","volume":"6","author":"J. Nickolls","year":"2008","unstructured":"Nickolls J., Buck I., Garland M., Skadron K.: Scalable parallel programming with CUDA. Queue 6(2), 40\u201353 (2008)","journal-title":"Queue"},{"issue":"2","key":"142_CR28","doi-asserted-by":"crossref","first-page":"539","DOI":"10.1016\/j.jcp.2006.10.023","volume":"224","author":"K.N. Premnath","year":"2007","unstructured":"Premnath K.N., Abraham J.: Three-dimensional multi-relaxation time (mrt) lattice-Boltzmann models for multiphase flow. J. Comput. Phys. 224(2), 539\u2013559 (2007)","journal-title":"J. Comput. Phys."},{"key":"142_CR29","unstructured":"Ramanujam, J.: Tiling of iteration spaces for multicomputers. In: Proceedings International Conference Parallel Processing, pp. 179\u2013186. (1990)"},{"key":"142_CR30","doi-asserted-by":"crossref","unstructured":"Renganarayana, L., Harthikote-Matha, M., Dewri, R., Rajopadhye, S.: Towards optimal multi-level tiling for stencil computations. IPDPS\u201907, pp. 1\u201310, March (2007)","DOI":"10.1109\/IPDPS.2007.370291"},{"key":"142_CR31","doi-asserted-by":"crossref","unstructured":"Renganarayana, L., Rajopadhye, S.: Positivity, posynomials and tile size selection. In: SC \u201908, pp. 1\u201312 (2008)","DOI":"10.1109\/SC.2008.5213293"},{"key":"142_CR32","doi-asserted-by":"crossref","unstructured":"Ripeanu, M., Iamnitchi, A., Foster, I.: Cactus application: Performance predictions in a grid environment. In: EuroPar\u201901. (2001)","DOI":"10.1007\/3-540-44681-8_114"},{"key":"142_CR33","doi-asserted-by":"crossref","unstructured":"Rivera G., Tseng, C.-W.: Tiling optimizations for 3D scientific computations. In: SC \u201900, p. 32 (2000)","DOI":"10.1109\/SC.2000.10015"},{"key":"142_CR34","doi-asserted-by":"crossref","unstructured":"Ueng, S.-Z., Baghsorkhi, S., Lathara, M., Hwu, W.m.: CUDA-lite: Reducing GPU programming complexity. In: LCPC\u201908. (2008)","DOI":"10.1007\/978-3-540-89740-8_1"},{"key":"142_CR35","doi-asserted-by":"crossref","unstructured":"Wonnacott, D.: Time skewing for parallel computers. In: WLCPC\u201999, pp. 477\u2013480 (1999)","DOI":"10.1007\/3-540-44905-1_35"},{"issue":"3","key":"142_CR36","doi-asserted-by":"crossref","first-page":"181","DOI":"10.1023\/A:1015460304860","volume":"30","author":"D. Wonnacott","year":"2002","unstructured":"Wonnacott D.: Achieving scalable locality with time skewing. Int. J. Parallel Program. 30(3), 181\u2013221 (2002)","journal-title":"Int. J. Parallel Program."},{"key":"142_CR37","doi-asserted-by":"crossref","first-page":"198","DOI":"10.1109\/CSSE.2008.1448","volume":"3","author":"Z. Yang","year":"2008","unstructured":"Yang Z., Zhu Y., Pu Y.: Parallel image processing based on CUDA. Int. Conf. Comput. Sci. Software Eng. 3, 198\u2013201 (2008)","journal-title":"Int. Conf. Comput. Sci. Software Eng."}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-010-0142-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10766-010-0142-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-010-0142-5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,10,30]],"date-time":"2021-10-30T09:11:04Z","timestamp":1635585064000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10766-010-0142-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2010,6,30]]},"references-count":37,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2011,2]]}},"alternative-id":["142"],"URL":"https:\/\/doi.org\/10.1007\/s10766-010-0142-5","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"value":"0885-7458","type":"print"},{"value":"1573-7640","type":"electronic"}],"subject":[],"published":{"date-parts":[[2010,6,30]]}}}