{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:38:41Z","timestamp":1740123521280,"version":"3.37.3"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2019,1,10]],"date-time":"2019-01-10T00:00:00Z","timestamp":1547078400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"The National Key Research and Development Program of China","award":["2017YFB0202002"],"award-info":[{"award-number":["2017YFB0202002"]}]},{"DOI":"10.13039\/501100001809","name":"The National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["61572394"],"award-info":[{"award-number":["61572394"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2019,7]]},"DOI":"10.1007\/s11227-019-02749-1","type":"journal-article","created":{"date-parts":[[2019,1,9]],"date-time":"2019-01-09T23:14:09Z","timestamp":1547075649000},"page":"3810-3841","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["NoT: a high-level no-threading parallel programming method for heterogeneous systems"],"prefix":"10.1007","volume":"75","author":[{"given":"Shusen","family":"Wu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoshe","family":"Dong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingjun","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhengdong","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,1,10]]},"reference":[{"key":"2749_CR1","unstructured":"The CUDA Toolkit. \n                    https:\/\/developer.nvidia.com\/cuda-toolkit\n                    \n                  . Accessed 10 May 2018"},{"key":"2749_CR2","unstructured":"The OpenCL standard. \n                    https:\/\/www.khronos.org\/opencl\/\n                    \n                  . Accessed 10 May 2018"},{"key":"2749_CR3","doi-asserted-by":"crossref","unstructured":"Ryoo S, Rodrigues CI, Baghsorkhi SS, Stone SS, Kirk DB, Hwu WW(2008) Optimization principles and application performance evaluation of a multithreaded GPU using CUDA. In: Proceedings of the 13th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP\u201908, pp 73\u201382","DOI":"10.1145\/1345206.1345220"},{"key":"2749_CR4","unstructured":"Alberto M, Christophe D, Michael OB (2014) Automatic optimization of thread-coarsening for graphics processors. In: Proceedings of the 23rd International Conference on Parallel Architectures and Compilation, PACT\u201914, pp 455\u2013466"},{"key":"2749_CR5","doi-asserted-by":"crossref","unstructured":"Luk CK, Hong S, Kim H (2009) Qilin: exploiting parallelism on heterogeneous multiprocessors with adaptive mapping. In: Proceedings of the 42nd Annual IEEE\/ACM International Symposium on Microarchitecture, MICRO 42, pp 45\u201355","DOI":"10.1145\/1669112.1669121"},{"issue":"1","key":"2749_CR6","doi-asserted-by":"publisher","first-page":"78","DOI":"10.1109\/TPDS.2010.62","volume":"22","author":"TD Han","year":"2011","unstructured":"Han TD, Abdelrahman TS (2011) hiCUDA: high-level GPGPU programming. IEEE Trans Parallel Distrib Syst 22(1):78\u201390","journal-title":"IEEE Trans Parallel Distrib Syst"},{"issue":"4","key":"2749_CR7","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/2744202","volume":"11","author":"Z Wang","year":"2015","unstructured":"Wang Z, Grewe D, O\u2019boyle MFP (2015) Automatic and portable mapping of data parallel programs to OpenCL for GPU-based heterogeneous systems. ACM Trans Archit Code Optim 11(4):1\u201326","journal-title":"ACM Trans Archit Code Optim"},{"key":"2749_CR8","unstructured":"The OpenACC Homepage. \n                    https:\/\/www.openacc.org\/\n                    \n                  . Accessed 10 May 2018"},{"key":"2749_CR9","unstructured":"High Performance Fortran Forum. \n                    http:\/\/hpff.rice.edu\/\n                    \n                  . Accessed 10 May 2018"},{"issue":"3","key":"2749_CR10","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1177\/1094342007078442","volume":"21","author":"BL Chamberlain","year":"2007","unstructured":"Chamberlain BL, Callahan D, Zima HP (2007) Parallel programmability and the Chapel language. Int J High Perform Comput Appl 21(3):291\u2013312","journal-title":"Int J High Perform Comput Appl"},{"key":"2749_CR11","unstructured":"C++ Accelerated Massive Parallelism. \n                    https:\/\/msdn.microsoft.com\/en-us\/library\/hh265137.aspx\n                    \n                  . Accessed 10 May 2018"},{"issue":"1","key":"2749_CR12","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1145\/1327452.1327492","volume":"51","author":"J Dean","year":"2008","unstructured":"Dean J, Ghemawat S (2008) MapReduce: simplified data processing on large clusters. Commun ACM 51(1):107\u2013113","journal-title":"Commun ACM"},{"issue":"8","key":"2749_CR13","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1145\/2038037.1941562","volume":"46","author":"B Catanzaro","year":"2011","unstructured":"Catanzaro B, Garland M, Keutzer K (2011) Copperhead: compiling an embedded data parallel language. ACM SIGPLAN Not 46(8):47\u201356","journal-title":"ACM SIGPLAN Not"},{"key":"2749_CR14","unstructured":"Zhang Y, Mueller F (2013) Hidp: a hierarchical data parallel language. In: Proceedings of the 2013 IEEE\/ACM International Symposium on Code Generation and Optimization, CGO\u201913, pp 1\u201311"},{"key":"2749_CR15","unstructured":"High-Performance Portable MPI. \n                    http:\/\/www.mpich.org\/\n                    \n                  . Accessed 10 May 2018"},{"key":"2749_CR16","unstructured":"The OpenMP API specification. \n                    http:\/\/www.openmp.org\/specifications\/\n                    \n                  . Accessed 10 May 2018"},{"issue":"10","key":"2749_CR17","doi-asserted-by":"publisher","first-page":"1400","DOI":"10.1016\/j.jpdc.2013.07.001","volume":"73","author":"LG Szafaryn","year":"2013","unstructured":"Szafaryn LG, Gamblin T, Supinski BRD, Skadron K (2013) Trellis: portability across architectures with a high-level framework. J Parallel Distrib Comput 73(10):1400\u20131413","journal-title":"J Parallel Distrib Comput"},{"issue":"12","key":"2749_CR18","doi-asserted-by":"publisher","first-page":"3202","DOI":"10.1016\/j.jpdc.2014.07.003","volume":"74","author":"EH Carter","year":"2014","unstructured":"Carter EH, Trott CR, Sunderland D (2014) Kokkos: enabling manycore performance portability through polymorphic memory access patterns. J Parallel Distrib Comput 74(12):3202\u20133216","journal-title":"J Parallel Distrib Comput"},{"key":"2749_CR19","doi-asserted-by":"crossref","unstructured":"Martineau M, Mcintosh-Smith S, Boulton M, Gaudin W (2016) An evaluation of emerging many-core parallel programming models. In: Proceedings of the 7th International Workshop on Programming Models and Applications for Multicores and Manycores, PMAM\u201916, pp 1\u201310","DOI":"10.1145\/2883404.2883420"},{"key":"2749_CR20","doi-asserted-by":"crossref","unstructured":"Lee S, Eigenmann R (2010) OpenMPC: extended OpenMP programming and tuning for GPUs. In: Proceedings of the 2010 ACM\/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis, SC\u201910, pp 1\u201311","DOI":"10.1109\/SC.2010.36"},{"issue":"3","key":"2749_CR21","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1016\/j.parco.2011.09.001","volume":"38","author":"A Kl\u00f6ckner","year":"2012","unstructured":"Kl\u00f6ckner A, Pinto N, Lee Y, Catanzaro B, Ivanov P, Fasih A (2012) PyCUDA and PyOpenCL: a scripting-based approach to GPU run-time code generation. Parallel Comput 38(3):157\u2013174","journal-title":"Parallel Comput"},{"key":"2749_CR22","doi-asserted-by":"crossref","unstructured":"Phothilimthana PM, Ansel J, Ragan-Kelley J, Amarasinghe S (2013) Portable performance on heterogeneous architectures. In: Proceedings of the Eighteenth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS\u201913), pp 431\u2013444","DOI":"10.1145\/2451116.2451162"},{"key":"2749_CR23","doi-asserted-by":"crossref","unstructured":"Chafi H, Sujeeth AK, Brown KJ, Lee HJ, Atreya AR, Olukotun K (2011) A domain-specific approach to heterogeneous parallelism. In: Proceedings of the 16th ACM Symposium on Principles and Practice of Parallel Programming (PPoPP\u201911), pp 35\u201346","DOI":"10.1145\/1941553.1941561"},{"key":"2749_CR24","doi-asserted-by":"crossref","unstructured":"Pu J, Bell S, Yang X, Setter J, Richardson S, Ragan-Kelley J, Horowitz M (2017) Programming heterogeneous systems from an image processing DSL. ACM Trans Archit Code Optim 14(3), Article 26","DOI":"10.1145\/3107953"},{"key":"2749_CR25","series-title":"Lecture Notes in Computer Science","volume-title":"Compiler construction, CC 2002, pp 179\u2013196","author":"W Thies","year":"2002","unstructured":"Thies W, Karczmarek M, Amarasinghe S (2002) StreamIt: a language for streaming applications. In: Horspool RN (ed) Compiler construction, CC 2002, pp 179\u2013196, vol 2304. Lecture Notes in Computer Science. Springer, Heidelberg"},{"issue":"3","key":"2749_CR26","doi-asserted-by":"publisher","first-page":"777","DOI":"10.1145\/1015706.1015800","volume":"23","author":"I Buck","year":"2004","unstructured":"Buck I, Foley T, Horn D, Sugerman J, Fatahalian K, Houston M, Hanrahan P (2004) Brook for GPUs: stream computing on graphics hardware. ACM Trans Graph 23(3):777\u2013786","journal-title":"ACM Trans Graph"},{"issue":"3","key":"2749_CR27","doi-asserted-by":"publisher","first-page":"381","DOI":"10.1145\/1961296.1950409","volume":"46","author":"AH Hormati","year":"2011","unstructured":"Hormati AH, Samadi M, Woh M, Mudge T, Mahlke S (2011) Sponge: portable stream programming on graphics engines. ACM SIGPLAN Not 46(3):381\u2013392","journal-title":"ACM SIGPLAN Not"},{"issue":"1","key":"2749_CR28","doi-asserted-by":"publisher","first-page":"118","DOI":"10.1007\/s11227-011-0656-7","volume":"61","author":"J Hong","year":"2012","unstructured":"Hong J, Hong K, Burgstaller B, Blieberger J (2012) StreamPI: a stream-parallel programming extension for object-oriented programming languages. J Supercomput 61(1):118\u2013140","journal-title":"J Supercomput"},{"issue":"10","key":"2749_CR29","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1145\/1932682.1869469","volume":"45","author":"J Auerbach","year":"2010","unstructured":"Auerbach J, Bacon DF, Cheng P, Rabbah R (2010) Lime: a Java-compatible and synthesizable language for heterogeneous architectures. ACM SIGPLAN Not 45(10):89\u2013108","journal-title":"ACM SIGPLAN Not"},{"key":"2749_CR30","doi-asserted-by":"crossref","unstructured":"Dubach C, Cheng P, Rabbah R, Bacon DF, Fink SJ (2012) Compiling a high-level language for GPUs: (via language support for architectures and compilers). In: Proceedings of the 33rd ACM SIGPLAN Conference on Programming Language Design and Implementation (PLDI\u201912), pp 1\u201312","DOI":"10.1145\/2254064.2254066"},{"issue":"1","key":"2749_CR31","doi-asserted-by":"publisher","first-page":"488","DOI":"10.1007\/s11227-014-1264-0","volume":"70","author":"Y Su","year":"2014","unstructured":"Su Y, Shi F, Talpur S, Wei J, Tan H (2014) Exploiting controlled-grained parallelism in message-driven stream programs. J Supercomput 70(1):488\u2013509","journal-title":"J Supercomput"},{"issue":"3","key":"2749_CR32","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1145\/1353536.1346318","volume":"43","author":"MD Linderman","year":"2008","unstructured":"Linderman MD, Collins JD, Wang H, Meng TH (2008) Merge: a programming model for heterogeneous multi-core systems. ACM SIGPLAN Not 43(3):287\u2013296","journal-title":"ACM SIGPLAN Not"},{"key":"2749_CR33","doi-asserted-by":"crossref","unstructured":"Enmyren J, Kessler CW (2010) SkePU: a multi-backend skeleton programming library for multi-GPU systems. In: Proceedings of the Fourth International Workshop on High-Level Parallel Programming and Applications (HLPP\u201910), pp 5\u201314","DOI":"10.1145\/1863482.1863487"},{"issue":"1","key":"2749_CR34","doi-asserted-by":"publisher","first-page":"62","DOI":"10.1007\/s10766-017-0490-5","volume":"46","author":"A Ernstsson","year":"2018","unstructured":"Ernstsson A, Li L, Kessler C (2018) SkePU 2: flexible and type-safe skeleton programming for heterogeneous parallel systems. Int J Parallel Program 46(1):62\u201380","journal-title":"Int J Parallel Program"},{"key":"2749_CR35","doi-asserted-by":"crossref","unstructured":"Steuwer M, Kegel P, Gorlatch S (2011) SkelCL: a portable skeleton library for high-level GPU programming. In: Proceedings of the 2011 IEEE International Symposium on Parallel and Distributed Processing Workshops and Phd Forum, pp 1176\u20131182","DOI":"10.1109\/IPDPS.2011.269"},{"key":"2749_CR36","doi-asserted-by":"crossref","unstructured":"Rodrigues C, Jablin T, Dakkak A, Hwu WM (2014) Triolet: a programming system that unifies algorithmic skeleton interfaces for high-performance cluster computing. In: Proceedings of the 19th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP\u201914), pp 247\u2013258","DOI":"10.1145\/2555243.2555268"},{"key":"2749_CR37","doi-asserted-by":"crossref","unstructured":"Steuwer M, Fensch C, Lindley S, Dubach C (2015) Generating performance portable code using rewrite rules: from high-level functional expressions to high-performance OpenCL code. In: Proceedings of the 20th ACM SIGPLAN International Conference on Functional Programming, ICFP 2015, pp 205\u2013217","DOI":"10.1145\/2784731.2784754"},{"key":"2749_CR38","doi-asserted-by":"crossref","unstructured":"Steuwer M, Remmelg T, Dubach C (2017) LIFT: A functional data-parallel IR for high-performance GPU code generation. In: Proceedings of the 2017 IEEE\/ACM International Symposium on Code Generation and Optimization, pp 74\u201385","DOI":"10.1109\/CGO.2017.7863730"},{"key":"2749_CR39","doi-asserted-by":"crossref","unstructured":"Collins A, Grewe D, Grover V, Lee S, Susnea A (2014) NOVA: a functional language for data parallelism. In: Proceedings of ACM SIGPLAN International Workshop on Libraries, Languages, and Compilers for Array Programming (ARRAY\u201914), pp 8\u201313","DOI":"10.1145\/2627373.2627375"},{"key":"2749_CR40","unstructured":"Henriksen T, Serup NGW, Elsman M, Henglein F, Oancea CE (2014) Futhark: purely functional gpu-programming with nested parallelism and in-place array updates. In: Proceedings of the 38th ACM SIGPLAN Conference on Programming Language Design and Implementation, PLDI\u201917, pp 556\u2013571"},{"key":"2749_CR41","volume-title":"Patterns for parallel programming","author":"T Mattson","year":"2004","unstructured":"Mattson T, Sanders B, Massingill B (2004) Patterns for parallel programming. Addison-Wesley Professional, Boston"},{"issue":"1","key":"2749_CR42","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1013208.1013209","volume":"36","author":"WM Johnston","year":"2004","unstructured":"Johnston WM, Hanna P Jr, Millar RJ (2004) Advances in dataflow programming languages. ACM Comput Surv 36(1):1\u201334","journal-title":"ACM Comput Surv"},{"key":"2749_CR43","volume-title":"Heterogeneous computing with OpenCL 2.0","author":"DR Kaeli","year":"2015","unstructured":"Kaeli DR, Mistry P, Schaa D, Zhang DP (2015) Heterogeneous computing with OpenCL 2.0. Morgan Kaufmann, San Francisco"},{"key":"2749_CR44","unstructured":"Stratton JA, Rodrigues C, Sung IJ, Obeid N, Chang LW, Anssari N, Liu GD, Hwu WW (2012) Parboil: a revised benchmark suite for scientific and commercial throughput computing. \n                    http:\/\/impact.crhc.illinois.edu\/Shared\/Docs\/impact-12-01.parboil.pdf\n                    \n                  . Accessed 10 May 2018"},{"key":"2749_CR45","unstructured":"The SPEC ACCEL benchmark. \n                    http:\/\/www.spec.org\/accel\/\n                    \n                  . Accessed 10 May 2018"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-019-02749-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-019-02749-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-019-02749-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,1,9]],"date-time":"2020-01-09T19:21:15Z","timestamp":1578597675000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-019-02749-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,1,10]]},"references-count":45,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2019,7]]}},"alternative-id":["2749"],"URL":"https:\/\/doi.org\/10.1007\/s11227-019-02749-1","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"type":"print","value":"0920-8542"},{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2019,1,10]]},"assertion":[{"value":"10 January 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}