{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T14:04:01Z","timestamp":1766066641670,"version":"3.41.0"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2016,10,5]],"date-time":"2016-10-05T00:00:00Z","timestamp":1475625600000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2017,6]]},"DOI":"10.1007\/s10766-016-0454-1","type":"journal-article","created":{"date-parts":[[2016,10,5]],"date-time":"2016-10-05T07:15:40Z","timestamp":1475651740000},"page":"711-729","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":15,"title":["Panda: A Compiler Framework for Concurrent CPU $$+$$ + GPU Execution of 3D Stencil Computations on GPU-accelerated Supercomputers"],"prefix":"10.1007","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1231-6355","authenticated-orcid":false,"given":"Mohammed","family":"Sourouri","sequence":"first","affiliation":[]},{"given":"Scott B.","family":"Baden","sequence":"additional","affiliation":[]},{"given":"Xing","family":"Cai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,10,5]]},"reference":[{"key":"454_CR1","doi-asserted-by":"crossref","unstructured":"Ang, J., Barrett, R., Benner, R., Burke, D., Chan, C., Cook, J., Donofrio, D., Hammond, S., Hemmert, K., Kelly, S., Le, H., Leung, V., Resnick, D., Rodrigues, A., Shalf, J., Stark, D., Unat, D., Wright, N.: Abstract machine models and proxy architectures for exascale computing. In: Proceedings of the 1st International Workshop on Hardware\u2013Software Co-Design for High Performance Computing (Co-HPC), pp. 25\u201332 (2014)","DOI":"10.1109\/Co-HPC.2014.4"},{"key":"454_CR2","doi-asserted-by":"crossref","unstructured":"Baskaran, M.M., Ramanujam, J., Sadayappan, P.: Automatic C-to-CUDA code generation for affine programs. In: Proceedings of the 19th Joint European Conference on Theory and Practice of Software, International Conference on Compiler Construction, pp. 244\u2013263 (2010)","DOI":"10.1007\/978-3-642-11970-5_14"},{"key":"454_CR3","doi-asserted-by":"crossref","unstructured":"Basumallik, A., Eigenmann, R.: Towards automatic translation of OpenMP to MPI. In: Proceedings of the 19th Annual International Conference on Supercomputing, pp. 189\u2013198 (2005)","DOI":"10.1145\/1088149.1088174"},{"key":"454_CR4","doi-asserted-by":"crossref","unstructured":"Christen, M., Schenk, O., Burkhart, B.: PATUS: A code generation and autotuning framework for parallel iterative stencil computations on modern microarchitectures. In: Parallel Distributed Processing Symposium (IPDPS), 2011 IEEE International, pp. 676\u2013687 (2011)","DOI":"10.1109\/IPDPS.2011.70"},{"key":"454_CR5","doi-asserted-by":"crossref","unstructured":"Dathathri, R., Reddy, C., Ramashekar, T., Bondhugula, U.: Generating efficient data movement code for heterogeneous architectures with distributed-memory. In: Proceedings of the 22nd International Conference on Parallel Architectures and Compilation Techniques, pp. 375\u2013386 (2013)","DOI":"10.1109\/PACT.2013.6618826"},{"key":"454_CR6","doi-asserted-by":"crossref","unstructured":"Grosser, T., Cohen, A., Holewinski, J., Sadayappan, P., Verdoolaege, S.: Hybrid hexagonal\/classical tiling for GPUs. In: Proceedings of Annual IEEE\/ACM International Symposium on Code Generation and Optimization, pp. 66:66\u201366:75 (2014)","DOI":"10.1145\/2581122.2544160"},{"key":"454_CR7","doi-asserted-by":"crossref","unstructured":"Gysi, T., Osuna, C., Fuhrer, O., Bianco, M., Schulthess, T.C.: STELLA: A domain-specific tool for structured grid methods in weather and climate models. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 41:1\u201341:12 (2015)","DOI":"10.1145\/2807591.2807627"},{"key":"454_CR8","first-page":"427","volume":"8","author":"M Hanslien","year":"2011","unstructured":"Hanslien, M., Artebrant, R., Tveito, A., Lines, G.T., Cai, X.: Stability of two time-integrators for the Aliev-Panfilov system. Int. J. Numer. Anal. Model. 8, 427\u2013442 (2011)","journal-title":"Int. J. Numer. Anal. Model."},{"key":"454_CR9","doi-asserted-by":"crossref","unstructured":"Holewinski, J., Pouchet, L.N., Sadayappan, P.: High-performance code generation for stencil computations on GPU architectures. In: Proceedings of the 26th ACM International Conference on Supercomputing, pp. 311\u2013320 (2012)","DOI":"10.1145\/2304576.2304619"},{"key":"454_CR10","doi-asserted-by":"crossref","unstructured":"Kamil, S., Chan, C., Oliker, L., Shalf, J., Williams, S.: An auto-tuning framework for parallel multicore stencil computations. In: Parallel Distributed Processing (IPDPS), 2010 IEEE International Symposium on, pp. 1\u201312 (2010)","DOI":"10.1109\/IPDPS.2010.5470421"},{"key":"454_CR11","doi-asserted-by":"crossref","unstructured":"Kim, J., Seo, S., Lee, J., Nah, J., Jo, G., Lee, J.: SnuCL: An OpenCL framework for heterogeneous CPU\/GPU clusters. In: Proceedings of the 26th ACM International Conference on Supercomputing, pp. 341\u2013352 (2012)","DOI":"10.1145\/2304576.2304623"},{"issue":"4","key":"454_CR12","doi-asserted-by":"crossref","first-page":"6","DOI":"10.1109\/MM.2015.70","volume":"35","author":"J Langguth","year":"2015","unstructured":"Langguth, J., Sourouri, M., Lines, G.T., Baden, S.B., Cai, X.: Scalable heterogeneous CPU\u2013GPU computations for unstructured tetrahedral meshes. Micro, IEEE 35(4), 6\u201315 (2015)","journal-title":"Micro, IEEE"},{"key":"454_CR13","unstructured":"Lawrence Livermore National Laboratory: ROSE compiler infrastructure. http:\/\/rosecompiler.org (2015). Accessed 04 June 2015"},{"key":"454_CR14","doi-asserted-by":"crossref","unstructured":"Lee, S., Eigenmann, R.: OpenMPC: Extended OpenMP programming and tuning for GPUs. In: Proceedings of the 2010 ACM\/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201311 (2010)","DOI":"10.1109\/SC.2010.36"},{"key":"454_CR15","doi-asserted-by":"crossref","unstructured":"Lee, S., Vetter, J.S.: Early evaluation of directive-based GPU programming models for productive exascale computing. In: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, pp. 23:1\u201323:11 (2012)","DOI":"10.1109\/SC.2012.51"},{"key":"454_CR16","doi-asserted-by":"crossref","unstructured":"Levesque, J.M., Sankaran, R., Grout, R.: Hybridizing S3D into an exascale application using OpenACC: An approach for moving to multi-petaflops and beyond. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 15:1\u201315:11 (2012)","DOI":"10.1109\/SC.2012.69"},{"issue":"4","key":"454_CR17","doi-asserted-by":"crossref","first-page":"59:1","DOI":"10.1145\/2400682.2400718","volume":"9","author":"T Lutz","year":"2013","unstructured":"Lutz, T., Fensch, C., Cole, M.: PARTANS: an autotuning framework for stencil computation on multi-GPU systems. ACM Trans. Archit. Code Optim. 9(4), 59:1\u201359:24 (2013)","journal-title":"ACM Trans. Archit. Code Optim."},{"key":"454_CR18","unstructured":"Mark Harris: CUDA pro tip: Write flexible kernels with grid-stride loops. http:\/\/goo.gl\/b8Vmkh (2015). Accessed 12 Nov 2015"},{"key":"454_CR19","unstructured":"Maruyama, N., Aoki, T.: Optimizing stencil computations for NVIDIA Kepler GPUs. In: Proceedings of the 1st International Workshop on High-Performance Stencil Computations, pp. 89\u201395 (2014)"},{"key":"454_CR20","doi-asserted-by":"crossref","unstructured":"Maruyama, N., Nomura, T., Sato, K., Matsuoka, S.: Physis: An implicitly parallel programming model for stencil computations on large-scale GPU-accelerated supercomputers. In: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 11:1\u201311:12 (2011)","DOI":"10.1145\/2063384.2063398"},{"key":"454_CR21","unstructured":"McCalpin, J.D.: Memory bandwidth and machine balance in current high performance computers. IEEE Computer Society Technical Committee on Computer Architecture (TCCA) Newsletter pp. 19\u201325 (1995)"},{"key":"454_CR22","doi-asserted-by":"crossref","unstructured":"Mittal, S., Vetter, J.S.: A survey of CPU\u2013GPU heterogeneous computing techniques. ACM Comput. Surv. 47(4) (2015)","DOI":"10.1145\/2788396"},{"key":"454_CR23","unstructured":"NVIDIA: NVIDIA\u2019s next generation CUDA compute architecture: Kepler GK110. http:\/\/goo.gl\/9ju84x (2013). Accessed 12 Nov 2015"},{"key":"454_CR24","doi-asserted-by":"crossref","unstructured":"Olschanowsky, C., Strout, M.M., Guzik, S., Loffeld, J., Hittinger, J.: A study on balancing parallelism, data locality, and recomputation in existing PDE solvers. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 793\u2013804 (2014)","DOI":"10.1109\/SC.2014.70"},{"key":"454_CR25","unstructured":"OpenACC - Directives for Accelerators: The OpenACC Application Program Interface. http:\/\/openacc-standard.org (2015). Accessed 23 May 2015"},{"key":"454_CR26","unstructured":"OpenMP Architecture Review Board: OpenMP Application Program Interface. http:\/\/openmp.org (2015). Accessed 23 May 2015"},{"key":"454_CR27","doi-asserted-by":"crossref","unstructured":"Ragan-Kelley, J., Barnes, C., Adams, A., Paris, S., Durand, F., Amarasinghe, S.: Halide: A language and compiler for optimizing parallelism, locality, and recomputation in image processing pipelines. In: Proceedings of the 34th ACM SIGPLAN Conference on Programming Language Design and Implementation, pp. 519\u2013530 (2013)","DOI":"10.1145\/2491956.2462176"},{"key":"454_CR28","doi-asserted-by":"crossref","unstructured":"Rahman, S.M.F., Yi, Q., Qasem, A.: Understanding stencil code performance on multicore architectures. In: Proceedings of the 8th ACM International Conference on Computing Frontiers, pp. 30:1\u201330:10 (2011)","DOI":"10.1145\/2016604.2016641"},{"key":"454_CR29","doi-asserted-by":"crossref","unstructured":"Ravishankar, M., Dathathri, R., Elango, V., Pouchet, L.N., Ramanujam, J., Rountev, A., Sadayappan, P.: Distributed memory code generation for mixed irregular\/regular computations. In: Proceedings of the 20th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP 2015, pp. 65\u201375 (2015)","DOI":"10.1145\/2688500.2688515"},{"key":"454_CR30","doi-asserted-by":"crossref","unstructured":"Sch\u00e4fer, A., Fey, D.: High performance stencil code algorithms for GPGPUs. In: Proceedings of 2011 International Conference on Computational Sciences (ICCS) 4, 2027\u20132036 (2011)","DOI":"10.1016\/j.procs.2011.04.221"},{"key":"454_CR31","doi-asserted-by":"crossref","unstructured":"Shimokawabe, T., Aoki, T., Onodera, N.: High-productivity framework on GPU-rich supercomputers for operational weather prediction code ASUCA. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 251\u2013261 (2014)","DOI":"10.1109\/SC.2014.26"},{"key":"454_CR32","doi-asserted-by":"crossref","unstructured":"Shimokawabe, T., Aoki, T., Takaki, T., Endo, T., Yamanaka, A., Maruyama, N., Nukada, A., Matsuoka, S.: Peta-scale phase-field simulation for dendritic solidification on the TSUBAME 2.0 supercomputer. In: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 3:1\u20133:11 (2011)","DOI":"10.1145\/2063384.2063388"},{"key":"454_CR33","doi-asserted-by":"crossref","unstructured":"Sourouri, M., Langguth, J., Spiga, F., Baden, S.B., Cai, X.: CPU+GPU programming of stencil computations for resource-efficient use of GPU clusters. In: Computational Science and Engineering (CSE), 2015 IEEE 18th International Conference on, pp. 17\u201326 (2015)","DOI":"10.1109\/CSE.2015.33"},{"key":"454_CR34","doi-asserted-by":"crossref","unstructured":"Su, H., Wu, N., Wen, M., Zhang, C., Cai, X.: On the GPU performance of 3D stencil computations implemented in OpenCL. In: Proceedings of the 28th International Supercomputing Conference 7905, 125\u2013135 (2013)","DOI":"10.1007\/978-3-642-38750-0_10"},{"key":"454_CR35","unstructured":"Top500.org: June 2015\u2014the green500 list. http:\/\/www.green500.org\/lists\/green201506 (2015). Accessed 04 Sept 2015"},{"key":"454_CR36","unstructured":"Top500.org: November 2015\u2014top500 supercomputer sites. http:\/\/top500.org\/lists\/2015\/11\/ (2015). Accessed 18 Nov 2015"},{"key":"454_CR37","doi-asserted-by":"crossref","unstructured":"Unat, D., Cai, X., Baden, S.B.: Mint: Realizing CUDA performance in 3D stencil methods with annotated C. In: Proceedings of the International Conference on Supercomputing, pp. 214\u2013224 (2011)","DOI":"10.1145\/1995896.1995932"},{"key":"454_CR38","doi-asserted-by":"crossref","unstructured":"Venkatasubramanian, S., Vuduc, R.W.: Tuned and wildly asynchronous stencil kernels for hybrid CPU\/GPU systems. In: Proceedings of the 23rd International Conference on Supercomputing, pp. 244\u2013255 (2009)","DOI":"10.1145\/1542275.1542312"},{"key":"454_CR39","doi-asserted-by":"crossref","unstructured":"Wienke, S., Springer, P., Terboven, C., Mey, D.: OpenACC - First Experiences with Real-World Applications. In: Euro-Par 2012 Parallel Processing\u201418th International Conference, vol. 7484, pp. 859\u2013870 (2012)","DOI":"10.1007\/978-3-642-32820-6_85"},{"key":"454_CR40","doi-asserted-by":"crossref","unstructured":"Williams, S., Kalamkar, D.D., Singh, A., Deshpande, A.M., Van\u00a0Straalen, B., Smelyanskiy, M., Almgren, A., Dubey, P., Shalf, J., Oliker, L.: Optimization of geometric multigrid for emerging multi- and manycore processors. In: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, pp. 96:1\u201396:11 (2012)","DOI":"10.1109\/SC.2012.85"},{"issue":"4","key":"454_CR41","doi-asserted-by":"crossref","first-page":"65","DOI":"10.1145\/1498765.1498785","volume":"52","author":"S Williams","year":"2009","unstructured":"Williams, S., Waterman, A., Patterson, D.: Roofline: An insightful visual performance model for multicore architectures. Commun. ACM 52(4), 65\u201376 (2009)","journal-title":"Commun. ACM"},{"key":"454_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Mueller, F.: Auto-generation and auto-tuning of 3D stencil codes on GPU clusters. In: Proceedings of the Tenth International Symposium on Code Generation and Optimization, pp. 155\u2013164 (2012)","DOI":"10.1145\/2259016.2259037"}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-016-0454-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10766-016-0454-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-016-0454-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T23:54:31Z","timestamp":1749599671000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10766-016-0454-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,10,5]]},"references-count":42,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2017,6]]}},"alternative-id":["454"],"URL":"https:\/\/doi.org\/10.1007\/s10766-016-0454-1","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"type":"print","value":"0885-7458"},{"type":"electronic","value":"1573-7640"}],"subject":[],"published":{"date-parts":[[2016,10,5]]}}}