{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,29]],"date-time":"2025-09-29T08:15:25Z","timestamp":1759133725568,"version":"3.33.0"},"reference-count":55,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2007,3,15]],"date-time":"2007-03-15T00:00:00Z","timestamp":1173916800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2007,3,15]],"date-time":"2007-03-15T00:00:00Z","timestamp":1173916800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Cluster Comput"],"published-print":{"date-parts":[[2007,6]]},"DOI":"10.1007\/s10586-007-0011-1","type":"journal-article","created":{"date-parts":[[2007,3,14]],"date-time":"2007-03-14T19:44:19Z","timestamp":1173901459000},"page":"115-126","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Performance portability on EARTH: a case study across several parallel architectures"],"prefix":"10.1007","volume":"10","author":[{"given":"Weirong","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Yanwei","family":"Niu","sequence":"additional","affiliation":[]},{"given":"Guang\u00a0R.","family":"Gao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2007,3,15]]},"reference":[{"unstructured":"The 23rd TOP500 Supercomputer list for June 2004: http:\/\/www.top500.org\/list\/2004\/06","key":"11_CR1"},{"unstructured":"Sun, Y., Bader, D.: Broadcast on clusters of SMPs with optimal concurrency. AHPCC Technical Report 2000-013, June 2000","key":"11_CR2"},{"doi-asserted-by":"crossref","unstructured":"Reussner, R., Hunzelmann, G.: Achieving performance portability with SKaMPI for high-performance MPI programs. In: ICCS \u201901: Proceedings of the International Conference on Computational Science\u2014Part II, pp.\u00a0841\u2013850. Springer-Verlag, London (2001)","key":"11_CR3","DOI":"10.1007\/3-540-45718-6_89"},{"key":"11_CR4","volume-title":"Designing and Building Parallel Programs: Concepts and Tools for Parallel Software Engineering","author":"I. Foster","year":"1995","unstructured":"Foster, I.: Designing and Building Parallel Programs: Concepts and Tools for Parallel Software Engineering. Addison-Wesley, Reading (1995)"},{"unstructured":"Borkar, S.Y., Mulder, H., Dubey, P., Pawlowski, S.S., Kahn, K.C., Rattner, J.R., Kuck, D.J.: Platform 2015: Intel processor and platform evolution for the next decade. ftp:\/\/download.intel.com\/technology\/computing\/archinnov\/platform2015\/, 2005","key":"11_CR5"},{"unstructured":"The CELL project at IBM Research: http:\/\/www.research.ibm.com\/cell\/","key":"11_CR6"},{"unstructured":"Goodarce, J.: Challenges in programming the multiprocessor platforms. In: 5th International Forum on Application-Specific Multi-Processor SoC, Saint-Maximin la Sainte Baume, France, July 2004","key":"11_CR7"},{"doi-asserted-by":"crossref","unstructured":"Dennis, J.B., Misunas, D.: A preliminary architecture for a basic data flow processor. In: Proceedings of the 2nd Annual International Symposium on Computer Architecture, 1974, pp. 126\u2013132","key":"11_CR8","DOI":"10.1145\/641675.642111"},{"issue":"2","key":"11_CR9","doi-asserted-by":"crossref","first-page":"42","DOI":"10.1109\/MC.1982.1653940","volume":"15","author":"Arvind","year":"1982","unstructured":"Arvind, Gostelow, K.P.: The U-interpreter. IEEE Comput. 15(2), 42\u201349 (1982)","journal-title":"IEEE Comput."},{"issue":"2","key":"11_CR10","doi-asserted-by":"crossref","first-page":"26","DOI":"10.1109\/MC.1982.1653939","volume":"15","author":"A.L. Davis","year":"1982","unstructured":"Davis, A.L., Keller, R.M.: Data flow progarm graphs. Comput. 15(2), 26\u201341 (1982)","journal-title":"Comput."},{"issue":"8","key":"11_CR11","doi-asserted-by":"crossref","first-page":"27","DOI":"10.1109\/2.303620","volume":"27","author":"B. Lee","year":"1994","unstructured":"Lee, B., Hurson, A.: Dataflow architectures and multithreading. IEEE Comput. 27(8), 27\u201339 (1994)","journal-title":"IEEE Comput."},{"issue":"13\u201314","key":"11_CR12","doi-asserted-by":"publisher","first-page":"1907","DOI":"10.1016\/S0167-8191(99)00070-8","volume":"25","author":"W.A. Najjar","year":"1999","unstructured":"Najjar, W.A., Lee, E.A., Gao, G.R.: Advances in the dataflow computational model. Parallel Comput. 25(13\u201314), 1907\u20131929 (1999)","journal-title":"Parallel Comput."},{"unstructured":"Theobald, K.B.: EARTH: An efficient architecture for running threads. Ph.D. dissertation, May 1999","key":"11_CR13"},{"unstructured":"Hum, H.H.J., Maquelin, O., Theobald, K.B., Tian, X., Tang, X., Gao, G.R.: A design study of the EARTH multiprocessor. In: Proceedings of the Conference on Parallel Architectures and Compilation Techniques (PACT), 1995, pp.\u00a059\u201368","key":"11_CR14"},{"doi-asserted-by":"crossref","unstructured":"Theobald, K.B., Agrawal, G., Kumar, R., Heber, G., Gao, G.R., Stodghill, P., Pingali, K.: Landing CG on EARTH: A case study of fine-grained multithreading on an evolutionary path. In: Proceedings of Supercomputing\u20192000, Nov. 2000","key":"11_CR15","DOI":"10.1109\/SC.2000.10011"},{"doi-asserted-by":"crossref","unstructured":"del Cuvillo, J., Tian, X., Gao, G.R., Girkar, M.: Performance study of a whole genome comparison tool on a hyper-threading multiprocessor. In: Fifth International Symposium on High Performance Computing, Tokyo, Japan, Oct. 2003","key":"11_CR16","DOI":"10.1007\/978-3-540-39707-6_40"},{"doi-asserted-by":"crossref","unstructured":"Zhu, W., Niu, Y., Lu, J., Shen, C., Gao, G.R.: A cluster-based solution for high performance hmmpfam using earth execution model. In: Proceedings of IEEE 5th International Conference on Cluster Computing (CLUSTER\u201903), Hong Kong, P.R. China, Dec. 2003, pp. 30\u201337","key":"11_CR17","DOI":"10.1109\/CLUSTR.2003.1253296"},{"unstructured":"Chen, F., Theobald, K.B., Gao, G.R.: Implementing parallel conjugate gradient on the EARTH multithreaded architecture. In: Proceedings of IEEE 6th International Conference on Cluster Computing (CLUSTER\u201904), San Diego, California, 20\u201323 Sept. 2004","key":"11_CR18"},{"unstructured":"Tremblay, G., Theobald, K.B., Morrone, C.J., Butala, M.D., Amaral, J.N., Gao, G.R.: Threaded-C language reference manual (release 2.0). CAPSL Technical Memo 39 (2000)","key":"11_CR19"},{"unstructured":"Shen, C.: A portable runtime system and its derivation for the hardware SU implementation. Master\u2019s thesis, Univ. of Delaware, Newark, DE, December 2003","key":"11_CR20"},{"unstructured":"Kakulavarapu, P., Maquelin, O., Gao, G.R.: Design of the runtime system for the portable Threaded-C language. CAPSL Technical Memo 24 (1998)","key":"11_CR21"},{"unstructured":"Morrone, C.J.: An EARTH runtime system for multi-processor\/multi-node Beowulf clusters. Master\u2019s thesis, Univ. of Delaware, Newark, DE, May 2001","key":"11_CR22"},{"unstructured":"Hum, H.H.J.: The super-actor machine: A hybrid dataflow\/von neuman architecture. Ph.D. dissertation, McGill University, Montreal, Canada, May 1992","key":"11_CR23"},{"unstructured":"The Argonne scalable cluster. http:\/\/www-unix.mcs.anl.gov\/chiba\/","key":"11_CR24"},{"unstructured":"The Argonne JAZZ cluster, laboratory computing resource center (lcrc). http:\/\/www.lcrc.anl.gov\/jazz\/","key":"11_CR25"},{"unstructured":"Bailey, D., Harris, T., Saphir, W., van\u00a0der Wijngaart, R., Woo, A., Yarrow, M.: The NAS parallel benchmarks 2.0. (1995)","key":"11_CR26"},{"unstructured":"HMMER: sequence analysis using profile hidden Markov models. http:\/\/hmmer.wustl.edu\/","key":"11_CR27"},{"unstructured":"Gao, G., Yates, R.: The argument-fetching dataflow architecture project: A status report. In: Can. Conf. on Elec. and Comp. Eng., Montreal, Sept. 1989","key":"11_CR28"},{"doi-asserted-by":"crossref","unstructured":"Sodan, A., Gao, G.R., Maquelin, O., Schultz, J.-U., Tian, X.-M.: Experiences with non-numeric applications on multithreaded architectures. In: Proceedings of the Sixth ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming (PPOPP97), 1997, pp.\u00a0124\u2013135","key":"11_CR29","DOI":"10.1145\/263764.263782"},{"key":"11_CR30","doi-asserted-by":"publisher","first-page":"176","DOI":"10.1145\/341800.341821","volume-title":"SPAA \u201900: Proceedings of the twelfth annual ACM symposium on Parallel algorithms and architectures","author":"P. Thulasiraman","year":"2000","unstructured":"Thulasiraman, P., Theobald, K.B., Khokhar, A.A., Gao, G.R.: Multithreaded algorithms for the fast Fourier transform. In: SPAA \u201900: Proceedings of the twelfth annual ACM symposium on Parallel algorithms and architectures, pp.\u00a0176\u2013185. ACM, New York (2000)"},{"key":"11_CR31","first-page":"18","volume-title":"IPDPS \u201901: Proceedings of the 15th International Parallel & Distributed Processing Symposium","author":"R.K. Thulasiram","year":"2001","unstructured":"Thulasiram, R.K., Litov, L., Nojumi, H., Downing, C.T., Gao, G.R.: Multithreaded algorithms for pricing a class of complex options. In: IPDPS \u201901: Proceedings of the 15th International Parallel & Distributed Processing Symposium, p.\u00a018. IEEE Computer Society, Washington, USA (2001)"},{"issue":"3","key":"11_CR32","doi-asserted-by":"publisher","first-page":"183","DOI":"10.1002\/cpe.604","volume":"14","author":"K.B. Theobald","year":"2002","unstructured":"Theobald, K.B., Kumar, R., Agrawal, G., Heber, G., Thulasiram, R.K., Gao, G.R.: Implementation and evaluation of a communication intensive application on the EARTH multithreaded system. Concurr. Comput. Pract. Experience 14(3), 183\u2013201 (2002)","journal-title":"Concurr. Comput. Pract. Experience"},{"issue":"1","key":"11_CR33","doi-asserted-by":"publisher","first-page":"68","DOI":"10.1016\/j.jpdc.2003.06.003","volume":"64","author":"P. Thulasiraman","year":"2004","unstructured":"Thulasiraman, P., Khokhar, A.A., Heber, G., Gao, G.R.: A fine-grain load-adaptive algorithm of the 2D discrete wavelet transform for multithreaded architectures. J. Parallel Distrib. Comput. 64(1), 68\u201378 (2004)","journal-title":"J. Parallel Distrib. Comput."},{"key":"11_CR34","volume-title":"Using MPI: Portable Parallel Programming with the Message-Passing Interface","author":"W. Gropp","year":"1994","unstructured":"Gropp, W., Lusk, E., Skjellum, A.: Using MPI: Portable Parallel Programming with the Message-Passing Interface. MIT Press, Cambridge, USA (1994)"},{"key":"11_CR35","volume-title":"Parallel Programming with MPI","author":"P. Pacheco","year":"1997","unstructured":"Pacheco, P.: Parallel Programming with MPI. Morgan Kaufmann, San Francisco (1997)"},{"doi-asserted-by":"crossref","unstructured":"Gropp, W.D., Lusk, E.: User\u2019s Guide for MPICH, a Portable Implementation of MPI. Mathematics and Computer Science Division, Argonne National Laboratory, aNL-96\/6 (1996)","key":"11_CR36","DOI":"10.2172\/378911"},{"key":"11_CR37","doi-asserted-by":"publisher","first-page":"381","DOI":"10.1145\/377792.377895","volume-title":"Proceedings of the 15th ACM International Conference on Supercomputing (ICS-01)","author":"H. Tang","year":"2001","unstructured":"Tang, H., Yang, T.: Optimizing threaded MPI execution on SMP clusters. In: Proceedings of the 15th ACM International Conference on Supercomputing (ICS-01), pp.\u00a0381\u2013392. ACM, New York (2001)"},{"doi-asserted-by":"crossref","unstructured":"Sistare, S., van de Vaart, R., Loh, E.: Optimization of MPI collectives on clusters of large-scale SMPs. In: Proceedings of Supercomputing 1999 (SC99). ACM and IEEE Computer Society Press, New York (1999)","key":"11_CR38","DOI":"10.1145\/331532.331555"},{"doi-asserted-by":"crossref","unstructured":"Takahashi, T., O\u2019Carroll, F., Tezuka, H., Hori, A., Sumimoto, S., Harada, H., Ishikawa, Y., Beckman, P.H.: Implementation and evaluation of MPI on an SMP cluster. In: Proceedings of the 11th IPPS\/SPDP\u201999 Workshops Held in Conjunction with the 13th International Parallel Processing Symposium and 10th Symposium on Parallel and Distributed Processing, pp.\u00a01178\u20131192. Springer-Verlag, London (1999)","key":"11_CR39","DOI":"10.1007\/BFb0098001"},{"unstructured":"TOMPI, a threads-only MPI implementation. http:\/\/theory.lcs.mit.edu\/~edemaine\/TOMPI\/","key":"11_CR40"},{"doi-asserted-by":"crossref","unstructured":"Geist, A., Beguelin, A., Dongarra, J., Jiang, W., Manchek, R., Sunderam, V., PVM: Parallel Virtual Machine\u2014A Users\u2019 Guide and Tutorial for Networked Parallel Computing. MIT Press, Cambridge (1994)","key":"11_CR41","DOI":"10.7551\/mitpress\/5712.001.0001"},{"unstructured":"Santos, C., Aude, J.: PM-PVM: A portable multithreaded PVM. In: Proceedings of 13th International Parallel Processing Symposium and 10th Symposium on Parallel and Distributed Processing, San Juan, Puerto Rico, 12\u201316 April, 1999","key":"11_CR42"},{"issue":"5","key":"11_CR43","doi-asserted-by":"publisher","first-page":"407","DOI":"10.1002\/(SICI)1096-9128(19980425)10:5<407::AID-CPE326>3.0.CO;2-6","volume":"10","author":"H. Zhou","year":"1998","unstructured":"Zhou, H., Geist, A.: LPVM: a step towards multithread PVM. Concurr. Pract. Experience 10(5), 407\u2013416 (1998)","journal-title":"Concurr. Pract. Experience"},{"issue":"3","key":"11_CR44","doi-asserted-by":"publisher","first-page":"199","DOI":"10.1002\/(SICI)1096-9128(199803)10:3<199::AID-CPE295>3.0.CO;2-H","volume":"10","author":"A. Ferrari","year":"1998","unstructured":"Ferrari, A., Sunderam, V.: Multiparadigm Distributed Computing with TPVM. Concurr. Pract. Experience 10(3), 199\u2013228 (1998)","journal-title":"Concurr. Pract. Experience"},{"unstructured":"Chandra, R., Menon, R., Dagum, L., Kohr, D., Maydan, D., McDonald, J.: Parallel Programming in OpenMP. Morgan Kaufmann, San Mateo (2000)","key":"11_CR45"},{"unstructured":"Lu, H., Hu, Y.C., Zwaenepoel, W.: OpenMP on network of workstations. In: Proceedings of Supercomputing\u201998, Oct. 1998","key":"11_CR46"},{"key":"11_CR47","volume-title":"Proceedings of Supercomputing 2003 (SC2003)","author":"Y.-S. Kee","year":"2003","unstructured":"Kee, Y.-S., Kim, J.-S., Ha, S.: ParADE: An OpenMP programming environment for SMP cluster systems. In: Proceedings of Supercomputing 2003 (SC2003). ACM, Phoenix (2003)"},{"doi-asserted-by":"crossref","unstructured":"Ojima, Y., Sato, M., Harada, H., Ishikawa, Y.: Performance of cluster-enabled OpenMP for the SCASH software distributed shared memory system. In: Proceedings of the 3rd IEEE\/ACM Int\u2019l Symp. on Cluster Computing and the Grid (CCGrid\u201903), May 2003, pp. 450\u2013456","key":"11_CR48","DOI":"10.1109\/CCGRID.2003.1199400"},{"unstructured":"Butenhof, D.R.: Programming with POSIX(R) Threads. Addison-Wesley, Reading (1997)","key":"11_CR49"},{"doi-asserted-by":"crossref","unstructured":"L\u00f6f, H., Radovic, Z., Hagersten, E.: THROOM\u2014running POSIX multithreaded binaries on a cluster. Department of Information Technology, Uppsala University, Tech. Rep. 2003-026, Apr. 2003","key":"11_CR50","DOI":"10.1007\/978-3-540-45209-6_105"},{"doi-asserted-by":"crossref","unstructured":"Jamieson, P., Bilas, A.: CableS: Thread control and memory system extensions for shared virtual memory clusters, In: Lecture Notes in Computer Science, vol.\u00a02104 (2001)","key":"11_CR51","DOI":"10.1007\/3-540-44587-0_15"},{"issue":"2\u20133","key":"11_CR52","first-page":"83","volume":"9","author":"L. Smith","year":"2001","unstructured":"Smith, L., Bull, M.: Development of mixed mode MPI\/openMP applications. Sci. Program. 9(2\u20133), 83\u201398 (2001)","journal-title":"Sci. Program."},{"key":"11_CR53","volume-title":"Proceedings of Supercomputing\u20192000","author":"F. Cappello","year":"2000","unstructured":"Cappello, F., Etiemble, D.: MPI versus MPI+openMP on IBM SP for the NAS benchmarks. In: Proceedings of Supercomputing\u20192000. IEEE and ACM SIGARCH, Dallas (2000)"},{"unstructured":"Jost, G., Jin, H., an\u00a0Mey, D., Hatay, F.F.: Comparing the OpenMP, MPI, and hybrid programming paradigms on an SMP cluster. In: Proceedings of the Fifth European Workshop on OpenMP (EWOMP03), Aachen, Germany, September 2003","key":"11_CR54"},{"unstructured":"Rebenseifner, R.: Hybrid parallel programming: Performance problems and chances. In: Proceedings of the 45th Cray User Group Conference, Ohio, 12\u201316 May 2003","key":"11_CR55"}],"container-title":["Cluster Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-007-0011-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10586-007-0011-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-007-0011-1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-007-0011-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,14]],"date-time":"2025-01-14T23:12:57Z","timestamp":1736896377000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10586-007-0011-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2007,3,15]]},"references-count":55,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2007,6]]}},"alternative-id":["11"],"URL":"https:\/\/doi.org\/10.1007\/s10586-007-0011-1","relation":{},"ISSN":["1386-7857","1573-7543"],"issn-type":[{"type":"print","value":"1386-7857"},{"type":"electronic","value":"1573-7543"}],"subject":[],"published":{"date-parts":[[2007,3,15]]},"assertion":[{"value":"15 March 2007","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}