{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T12:15:22Z","timestamp":1763468122580},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2013,3,20]],"date-time":"2013-03-20T00:00:00Z","timestamp":1363737600000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2013,10]]},"DOI":"10.1007\/s11227-013-0912-0","type":"journal-article","created":{"date-parts":[[2013,3,19]],"date-time":"2013-03-19T15:26:41Z","timestamp":1363706801000},"page":"381-405","source":"Crossref","is-referenced-by-count":9,"title":["A compound OpenMP\/MPI program development toolkit for hybrid CPU\/GPU clusters"],"prefix":"10.1007","volume":"66","author":[{"given":"Hung-Fu","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tyng-Yeu","family":"Liang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jun-Yao","family":"Chiu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2013,3,20]]},"reference":[{"issue":"1","key":"912_CR1","doi-asserted-by":"crossref","first-page":"80","DOI":"10.1111\/j.1467-8659.2007.01012.x","volume":"26","author":"JD Owens","year":"2007","unstructured":"Owens JD, Luebke D, Govindaraju N, Harris M, Kr\u00fcger J, Lefohn AE, Purcell T (2007) A survey of general purpose computation on graphics hardware. Comput Graph Forum 26(1):80\u2013113","journal-title":"Comput Graph Forum"},{"key":"912_CR2","unstructured":"Top500 list, Nov 2012, Referenced from http:\/\/www.top500.org"},{"key":"912_CR3","unstructured":"Titan supercomputer, referenced from http:\/\/www.olcf.ornl.gov\/titan\/"},{"issue":"3","key":"912_CR4","doi-asserted-by":"crossref","first-page":"344","DOI":"10.1007\/s02011-011-1137-8","volume":"26","author":"X-J Yang","year":"2011","unstructured":"Yang X-J, Liao X-K, Lu K, Hu Q-F, Song J-Q, Su J-S (2011) The TianHe-1A supercomputer: its hardware and software. J Comput Sci Technol 26(3):344\u2013351","journal-title":"J Comput Sci Technol"},{"key":"912_CR5","doi-asserted-by":"crossref","first-page":"4","DOI":"10.1109\/I-SPAN.2009.150","volume-title":"Proceedings of 10th international symposium on pervasive systems, algorithms, and networks","author":"C Vecchiola","year":"2009","unstructured":"Vecchiola C, Pandey S, Buyya R (2009) High-performance cloud computing: a view of scientific applications. In: Proceedings of 10th international symposium on pervasive systems, algorithms, and networks, pp 4\u201316"},{"key":"912_CR6","doi-asserted-by":"crossref","first-page":"789","DOI":"10.1016\/0167-8191(96)00024-5","volume":"22","author":"W Gropp","year":"1996","unstructured":"Gropp W, Lusk E, Doss N, Skjellum A (1996) A high-performance, portable implementation of the MPI message passing interface standard. Parallel Comput 22:789\u2013828","journal-title":"Parallel Comput"},{"key":"912_CR7","unstructured":"The OpenMP Forum (1998) OpenMP C and C++ application program interface, version 1.0. http:\/\/www.openmp.org"},{"key":"912_CR8","unstructured":"NVIDIA CUDA programming guide version 2.1.1. http:\/\/www.nvidia.com.tw\/object\/cuda_develop_tw_old.html"},{"issue":"3","key":"912_CR9","doi-asserted-by":"crossref","first-page":"66","DOI":"10.1109\/MCSE.2010.69","volume":"12","author":"JE Stone","year":"2010","unstructured":"Stone JE, Gohara D, Shi G (2010) OpenCL: a parallel programming standard for heterogeneous computing systems. Comput Sci Eng 12(3):66\u201373","journal-title":"Comput Sci Eng"},{"issue":"2","key":"912_CR10","doi-asserted-by":"crossref","first-page":"18","DOI":"10.1109\/2.485843","volume":"29","author":"C Amza","year":"1996","unstructured":"Amza C, Cox AL, Dwarkadas H, Keleher P, Lu H, Rajamony R, Yu W, Zwaenepoel W (1996) TreadMarks: shared memory computing on networks of workstations. IEEE Comput 29(2):18\u201328","journal-title":"IEEE Comput"},{"key":"912_CR11","first-page":"273","volume":"2","author":"C Clark","year":"2005","unstructured":"Clark C, Fraser K, Hand SM, Hansen JG, Jul EB, Limpach C, Pratt IA, Warfield A (2005) Live migration of virtual machines. Proceedings of the 2nd Conference on Symposium on Networked Systems Design and Implementation 2:273\u2013286","journal-title":"Proceedings of the 2nd Conference on Symposium on Networked Systems Design and Implementation"},{"key":"912_CR12","series-title":"Lecture notes in computer science","first-page":"457","volume-title":"Proceedings of WOMPEI\u201902","author":"A Basumallik","year":"2012","unstructured":"Basumallik A, Min S-j, Eigenmann R (2012) Towards OpenMP execution on software distributed shared memory systems. In: Proceedings of WOMPEI\u201902. Lecture notes in computer science, vol\u00a02327, pp 457\u2013468"},{"key":"912_CR13","unstructured":"Microsoft, \u201cHLSL for DirectX\u201d. http:\/\/msdn.microsoft.com\/en-us\/library\/windows\/desktop\/bb509561.aspx"},{"key":"912_CR14","unstructured":"Kessenich J, Baldwin D, Rost R (2011) The OpenGL shader language"},{"key":"912_CR15","isbn-type":"print","volume-title":"The Cg tutorial: the definitive guide to programmable real-time graphics","author":"R Fernando","year":"2003","unstructured":"Fernando R, Kilgard MJ (2003) The Cg tutorial: the definitive guide to programmable real-time graphics. Addison-Wesley Professional, Reading. ISBN 0-321-19496-9","ISBN":"http:\/\/id.crossref.org\/isbn\/0321194969"},{"key":"912_CR16","series-title":"Lecture notes in computer science","doi-asserted-by":"crossref","first-page":"887","DOI":"10.1007\/978-3-642-03869-3_82","volume-title":"Euro-par 2009 parallel processing","author":"Y Yan","year":"2009","unstructured":"Yan Y, Grossman M, Sarkar V (2009) JCUDA: a programmer-friendly interface for accelerating Java programs with CUDA. In: Euro-par 2009 parallel processing. Lecture notes in computer science, vol 5704, pp 887\u2013899"},{"key":"912_CR17","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1145\/1808954.1808959","volume-title":"Proceeding of the 3rd international workshops on multicore software engineering","author":"G Dotzler","year":"2010","unstructured":"Dotzler G, Veldema R, Klemm M (2010) JCudaMP:OpenMP\/Java on CUDA. In: Proceeding of the 3rd international workshops on multicore software engineering, pp 10\u201317"},{"key":"912_CR18","first-page":"26","volume-title":"Proceeding of 2009 1st international conference on information science and engineering","author":"Q-k Chen","year":"2009","unstructured":"Chen Q-k, Zhang J-k (2009) A stream processor cluster architecture model with the hybrid technology of MPI and CUDA. In: Proceeding of 2009 1st international conference on information science and engineering, pp 26\u201328"},{"issue":"1","key":"912_CR19","doi-asserted-by":"crossref","first-page":"78","DOI":"10.1109\/TPDS.2010.62","volume":"22","author":"TD Han","year":"2011","unstructured":"Han TD, Abdelrahman TS (2011) hiCUDA: high-level GPGPU programming. IEEE Trans Parallel Distrib Syst 22(1):78\u201390","journal-title":"IEEE Trans Parallel Distrib Syst"},{"key":"912_CR20","first-page":"512","volume-title":"IEEE 13th international conference on high performance computing and communications (HPCC)","author":"G Noaje","year":"2011","unstructured":"Noaje G, Jaillet C, Krajecki M (2011) Source-to-source code translator: OpenMP C to CUDA. In: IEEE 13th international conference on high performance computing and communications (HPCC), pp 512\u2013519"},{"key":"912_CR21","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/SC.2010.36","volume-title":"2010 international conference for high performance computing, networking, storage and analysis (SC)","author":"S Lee","year":"2010","unstructured":"Lee S, Eigenmann R (2010) OpenMPC: extended OpenMP programming and tuning for GPUs. In: 2010 international conference for high performance computing, networking, storage and analysis (SC), pp 1\u201311"},{"key":"912_CR22","doi-asserted-by":"crossref","first-page":"260","DOI":"10.1145\/1454115.1454152","volume-title":"Proceedings of the 17th international conference on parallel architectures and compilation techniques","author":"B He","year":"2008","unstructured":"He B, Fang W, Luo Q, Govindaraju NK, Wang T (2008) Mars: a MapReduce framework on graphics processors. In: Proceedings of the 17th international conference on parallel architectures and compilation techniques, pp 260\u2013269"},{"key":"912_CR23","volume-title":"The proceedings of the workshop on general purpose processing on graphics processing units (GPGPU 2007)","author":"R Dolbeau","year":"2007","unstructured":"Dolbeau R, Bihan S, Bodin F (2007) HMPP: a hybrid multi-core parallel programming environment. In: The proceedings of the workshop on general purpose processing on graphics processing units (GPGPU 2007)"},{"key":"912_CR24","unstructured":"Tsai T-C (2010) OMP2OCL translator: a translator for automatic translation of OpenMP programs into OpenCL programs. Mater Thesis, Institute of Computer Science and Engineering, National Chiao-Tung University"},{"issue":"1","key":"912_CR25","doi-asserted-by":"crossref","first-page":"107","DOI":"10.1145\/1327452.1327492","volume":"51","author":"J Dean","year":"2008","unstructured":"Dean J, Ghmawat S (2008) MapReduce: simplified data processing on large clusters. Commun ACM 51(1):107\u2013113","journal-title":"Commun ACM"},{"key":"912_CR26","unstructured":"OpenACC API 1.0 (2012) http:\/\/www.openacc-standard.org\/Downloads\/"},{"key":"912_CR27","first-page":"2369","volume-title":"IPDPS 2102","author":"T-Y Liang","year":"2012","unstructured":"Liang T-Y, Li H-F, Chiu J-Y (2012) Enabling mixed OpenMP\/MPI programming on hybrid CPU\/GPU computing architecture. In: IPDPS 2102, pp 2369\u20132377"},{"issue":"2\/3","key":"912_CR28","doi-asserted-by":"crossref","first-page":"97","DOI":"10.1504\/IJGUC.2012.047760","volume":"3","author":"T-Y Liang","year":"2012","unstructured":"Liang T-Y, Chang Y-W, Li H-F (2012) A CUDA programming toolkit on grids. Int J Grid Util Comput 3(2\/3):97\u2013111","journal-title":"Int J Grid Util Comput"},{"key":"912_CR29","first-page":"225","volume-title":"Proceedings of the Linux symposium","author":"A Kivity","year":"2007","unstructured":"Kivity A, Kamay Y, Laor D, Lublin U, Liguori A (2007) KVM: Linux virtual machine monitor. In: Proceedings of the Linux symposium, vol 1, pp 225\u2013230"},{"key":"912_CR30","first-page":"209","volume-title":"Third international conference on intelligent networking and collaborative systems","author":"H-F Li","year":"2011","unstructured":"Li H-F, Liang T-Y, Jiang J-L (2011) An OpenMP compiler for hybrid CPU\/GPU computing architecture. In: Third international conference on intelligent networking and collaborative systems, pp 209\u2013216"},{"key":"912_CR31","series-title":"Lecture notes in computer science","doi-asserted-by":"crossref","first-page":"20","DOI":"10.1007\/3-540-44587-0_3","volume-title":"OpenMP shared memory parallel programming","author":"K Kusano","year":"2001","unstructured":"Kusano K, Sato M, Hosomi T, Seo Y (2001) The omni OpenMP compiler on the distributed shared memory of Cenju-4. In: OpenMP shared memory parallel programming. Lecture notes in computer science, vol 2104, pp 20\u201330"},{"key":"912_CR32","doi-asserted-by":"crossref","first-page":"101","DOI":"10.1145\/1504176.1504194","volume-title":"Proceedings of the 14th ACM SIGPLAN symposium on principles and practice of parallel programming","author":"S Lee","year":"2009","unstructured":"Lee S, Min S-J, Eigenmann R (2009) OpenMP to GPGPU: a compiler framework for automatic translation and optimization. In: Proceedings of the 14th ACM SIGPLAN symposium on principles and practice of parallel programming, pp 101\u2013110"},{"key":"912_CR33","doi-asserted-by":"crossref","unstructured":"Conway ME (1963) Design of a separable transition-diagram compiler. Commun ACM, 396\u2013408","DOI":"10.1145\/366663.366704"},{"key":"912_CR34","unstructured":"NVIDIA Development Zone (2012) CUDA C best practices guide, pp 51\u201352. http:\/\/developer.nvidia.com\/cuda\/nvidia-gpu-computing-documentation"},{"key":"912_CR35","doi-asserted-by":"crossref","first-page":"253","DOI":"10.1145\/1088149.1088183","volume-title":"Proceedings of 19th annual international conference on supercomputing","author":"G Alm\u00e1si","year":"2005","unstructured":"Alm\u00e1si G, Heidelberger P, Archer CJ, Martorell X, Erway CC, Moreira JE, Steinmacher-Burow B, Zheng Y (2005) Optimization of MPI collective communication on BlueGene\/L systems. In: Proceedings of 19th annual international conference on supercomputing, pp 253\u2013262"},{"key":"912_CR36","volume-title":"Proceedings of the 2000 ACM\/IEEE conference on supercomputing","author":"S Vadhiyar","year":"2000","unstructured":"Vadhiyar S, Fagg G, Dongarra J (2000) Automatically tuned collective communications. In: Proceedings of the 2000 ACM\/IEEE conference on supercomputing"},{"key":"912_CR37","doi-asserted-by":"crossref","first-page":"195","DOI":"10.1109\/ICPP.2004.1327921","volume-title":"International conference on parallel processing 2004","author":"J Corbalan","year":"2004","unstructured":"Corbalan J, Duran A, Labarta J (2004) Dynamic load balancing of MPI+OpenMP applications. In: International conference on parallel processing 2004, vol 1, pp 195\u2013202"},{"key":"912_CR38","doi-asserted-by":"crossref","first-page":"161","DOI":"10.1109\/ClusterW.2012.20","volume-title":"2012 IEEE international conference on cluster computing workshops","author":"K Zhang","year":"2012","unstructured":"Zhang K, Wu B (2012) Task scheduling for GPU heterogeneous cluster. In: 2012 IEEE international conference on cluster computing workshops, pp 161\u2013169"},{"key":"912_CR39","doi-asserted-by":"crossref","first-page":"95","DOI":"10.1109\/NISS.2009.171","volume-title":"2009 international conference on new trends in information and service science","author":"S Nian","year":"2009","unstructured":"Nian S, Guangmin L (2009) Dynamic load balancing algorithm for MPI parallel computing. In: 2009 international conference on new trends in information and service science, pp 95\u201399"},{"key":"912_CR40","first-page":"64","volume-title":"Proceedings of 15th Euro PVM\/MPI","author":"I Galindo","year":"2008","unstructured":"Galindo I, Almeida F (2008) Dynamic load balancing on dedicated heterogeneous systems. In: Proceedings of 15th Euro PVM\/MPI, pp 64\u201374"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-013-0912-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-013-0912-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-013-0912-0","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,6,30]],"date-time":"2023-06-30T11:04:50Z","timestamp":1688123090000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-013-0912-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2013,3,20]]},"references-count":40,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2013,10]]}},"alternative-id":["912"],"URL":"https:\/\/doi.org\/10.1007\/s11227-013-0912-0","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2013,3,20]]}}}