{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T08:44:58Z","timestamp":1773305098973,"version":"3.50.1"},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2014,3,19]],"date-time":"2014-03-19T00:00:00Z","timestamp":1395187200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Cluster Comput"],"published-print":{"date-parts":[[2014,12]]},"DOI":"10.1007\/s10586-014-0361-4","type":"journal-article","created":{"date-parts":[[2014,3,18]],"date-time":"2014-03-18T10:14:12Z","timestamp":1395137652000},"page":"1139-1155","source":"Crossref","is-referenced-by-count":27,"title":["Improved MPI collectives for MPI processes in shared address spaces"],"prefix":"10.1007","volume":"17","author":[{"given":"Shigang","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Torsten","family":"Hoefler","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chungjin","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Marc","family":"Snir","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2014,3,19]]},"reference":[{"key":"361_CR1","unstructured":"AMD: Software optimization guide for AMD family 15h processors (2012)."},{"key":"361_CR2","unstructured":"Aulwes, R., Daniel, D., Desai, N., Graham, R., Risinger, L., Taylor, M., Woodall, T., Sukalski, M.: Architecture of LA-MPI, a network-fault-tolerant MPI. In: Proceedings of the 18th International Parallel and Distributed Processing Symposium, p. 15 (2004)."},{"key":"361_CR3","doi-asserted-by":"crossref","unstructured":"Blagojevi\u0107, F., Hargrove, P., Iancu, C., Yelick, K.: Hybrid PGAS runtime support for multicore nodes. In: Proceedings of the Fourth Conference on Partitioned Global Address Space Programming Model PGAS \u201910, pp. 3:1\u20133:10. ACM (2010).","DOI":"10.1145\/2020373.2020376"},{"key":"361_CR4","unstructured":"Broquedis, F., Clet-Ortega, J., Moreaud, S., Furmento, N., Goglin, B., Mercier, G., Thibault, S., Namyst, R.: hwloc: A generic framework for managing hardware affinities in HPC applications. In: Proceedings of the 18th Euromicro Conference on Parallel, Distributed and Network-Based Processing PDP \u201910, pp. 180\u2013186. IEEE Computer Society (2010)."},{"key":"361_CR5","doi-asserted-by":"crossref","unstructured":"Feind, K., McMahon, K.: An ultrahigh performance MPI implementation on SGI ccNUMA Altix systems. Comput. Methods Sci. Technol., 67\u201370 (2006).","DOI":"10.12921\/cmst.2006.SI.01.67-70"},{"key":"361_CR6","unstructured":"Friedley, A., Bronevetsky, G., Lumsdaine, A., Hoefler, T.: Hybrid MPI: efficient message passing for multi-core systems. In: Proceedings of the SC13 IEEE\/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (2013)."},{"key":"361_CR7","doi-asserted-by":"crossref","unstructured":"Friedley, A., Hoefler, T., Bronevetsky, G., Lumsdaine, A., Ma, C.C.: Ownership passing: efficient distributed memory programming on multi-core systems. In: Proceedings of the 18th ACM symposium on Principles and Practice of Parallel Programming PPoPP\u201913 (Accepted) (2013).","DOI":"10.1145\/2442516.2442534"},{"key":"361_CR8","unstructured":"Graham, R.L., Shipman, G.: MPI support for multi-core architectures: optimized shared memory collectives. In: Proceedings of the 15th European PVM\/MPI Users\u2019 Group Meeting on Recent Advances in Parallel Virtual Machine and Message Passing Interface, pp. 130\u2013140. Springer, Berlin (2008)."},{"issue":"1","key":"361_CR9","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/BF01379320","volume":"17","author":"D Hensgen","year":"1988","unstructured":"Hensgen, D., Finkel, R., Manber, U.: Two algorithms for barrier synchronization. Int. J. Parallel Program. 17(1), 1\u201317 (1988)","journal-title":"Int. J. Parallel Program."},{"key":"361_CR10","doi-asserted-by":"crossref","unstructured":"Hoefler, T., Mehlan, T., Mietke, F., Rehm, W.: Fast barrier synchronization for InfiniBand. In: Proceedings of the 20th International Parallel and Distributed Processing Symposium IPDPS (2006).","DOI":"10.1109\/IPDPS.2006.1639561"},{"key":"361_CR11","unstructured":"Intel: Intel 64 and IA-32 Architectures optimization reference manual (2012)."},{"key":"361_CR12","unstructured":"Kamal, H., Wagner, A.: Fg-mpi: fine-grain mpi for multicore and clusters. In: Proceedings of the IEEE International Symposium on Parallel Distributed Processing, Workshops and Phd Forum (IPDPSW), pp. 1\u20138 (2010)."},{"key":"361_CR13","doi-asserted-by":"crossref","unstructured":"Kielmann, T., Hofman, R.F.H., Bal, H.E., Plaat, A., Bhoedjang, R.A.F.: MagPIe: MPI\u2019s collective communication operations for clustered wide area systems. In: Proceedings of the seventh ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming PPoPP \u201999, pp. 131\u2013140. ACM, New York (1999)","DOI":"10.1145\/301104.301116"},{"key":"361_CR14","doi-asserted-by":"crossref","unstructured":"Li, S., Hoefler, T., Snir, M.: Numa-aware shared-memory collective communication for mpi. In: Proceedings of the 22nd International Symposium on High-Performance Parallel and Distributed Computing HPDC \u201913, pp. 85\u201396. ACM, New York (2013)","DOI":"10.1145\/2493123.2462903"},{"key":"361_CR15","doi-asserted-by":"crossref","unstructured":"Mamidala, A., Kumar, R., De, D., Panda, D.: MPI collectives on modern multicore clusters: performance optimizations and communication characteristics. In: Proceedings of the 8th IEEE International Symposium on Cluster Computing and the Grid CCGRID \u201908, pp. 130\u2013137 (2008).","DOI":"10.1109\/CCGRID.2008.87"},{"issue":"4","key":"361_CR16","doi-asserted-by":"crossref","first-page":"269","DOI":"10.1145\/106973.106999","volume":"26","author":"JM Mellor-Crummey","year":"1991","unstructured":"Mellor-Crummey, J.M., Scott, M.L.: Synchronization without contention. SIGPLAN Notice 26(4), 269\u2013278 (1991)","journal-title":"SIGPLAN Notice"},{"key":"361_CR17","doi-asserted-by":"crossref","unstructured":"Molka, D., Hackenberg, D., Schone, R., Muller, M.S.: Memory performance and cache coherency effects on an Intel Nehalem multiprocessor system. In: Proceedings of the 18th International Conference on Parallel Architectures and Compilation Techniques PACT \u201909, pp. 261\u2013270. IEEE Computer Society, Washington (2009)","DOI":"10.1109\/PACT.2009.22"},{"key":"361_CR18","unstructured":"MPI Forum: MPI: A Message-passing interface standard. version 2.2 (2009)."},{"key":"361_CR19","doi-asserted-by":"crossref","unstructured":"Negara, S., Zheng, G., Pan, K.C., Negara, N., Johnson, R.E., Kal\u00e9, L.V., Ricker, P.M.: Automatic MPI to AMPI program transformation using photran. In: Proceedings of the Conference on Parallel Processing Euro-Par, pp. 531\u2013539. Springer, Berlin (2011)","DOI":"10.1007\/978-3-642-21878-1_65"},{"key":"361_CR20","unstructured":"Board, OpenMP Architecture Review: Application program interface version 3, 1 (2011)"},{"key":"361_CR21","unstructured":"P\u00e9rache, M., Carribault, P., Jourdren, H.: MPC-MPI: an MPI implementation reducing the overall memory consumption. In: Proceedings of the 16th European PVM\/MPI Users\u2019 Group Meeting on Recent Advances in Parallel Virtual Machine and Message Passing Interface, pp. 94\u2013103. Springer, Berlin (2009)."},{"key":"361_CR22","doi-asserted-by":"crossref","unstructured":"Rabenseifner, R.: Optimization of collective reduction operations. Computational Science-ICCS pp. 1\u20139. Springer, Berlin (2004).","DOI":"10.1007\/978-3-540-24685-5_1"},{"key":"361_CR23","doi-asserted-by":"crossref","unstructured":"Sistare, S., Vaart, R., Loh, E.: Optimization of MPI collectives on clusters of large-scale SMP\u2019s. In: Proceedings of the ACM\/IEEE 1999 Conference on Supercomputing (1999).","DOI":"10.1145\/331532.331555"},{"issue":"4","key":"361_CR24","doi-asserted-by":"crossref","first-page":"673","DOI":"10.1145\/363911.363920","volume":"22","author":"H Tang","year":"2000","unstructured":"Tang, H., Shen, K., Yang, T.: Program transformation and runtime support for threaded MPI execution on shared-memory machines. ACM Trans. Program. Lang. Syst. (TOPLAS) 22(4), 673\u2013700 (2000)","journal-title":"ACM Trans. Program. Lang. Syst. (TOPLAS)"},{"key":"361_CR25","doi-asserted-by":"crossref","unstructured":"Tang, H., Yang, T.: Optimizing threaded MPI execution on SMP clusters. In: Proceedings of the 15th International Conference on Supercomputing ICS \u201901, pp. 381\u2013392. ACM (2001).","DOI":"10.1145\/377792.377895"},{"key":"361_CR26","unstructured":"Thakur, R., Gropp, W.: Improving the performance of collective operations in MPICH. In: Proceedings of the 10th European PVM\/MPI User\u2019s Group Meeting in Recent Advances in Parallel Virtual Machine and Message Passing Interface. Lecture Notes in Computer Science, vol. 2840, pp. 257\u2013267. Springer, Berlin (2003)."},{"key":"361_CR27","doi-asserted-by":"crossref","first-page":"49","DOI":"10.1177\/1094342005051521","volume":"19","author":"R Thakur","year":"2005","unstructured":"Thakur, R., Rabenseifner, R., Gropp, W.: Optimization of collective communication operations in MPICH. Int. J. High Perform. Comput. Appl. 19, 49\u201366 (2005)","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"361_CR28","doi-asserted-by":"crossref","unstructured":"Tipparaju, V., Nieplocha, J., Panda, D.: Fast collective operations using shared and remote memory access protocols on clusters. In: Proceedings of the International IEEE on the Parallel and Distributed Processing Symposium, (2003).","DOI":"10.1109\/IPDPS.2003.1213188"},{"issue":"4","key":"361_CR29","first-page":"388","volume":"36","author":"PC Yew","year":"1987","unstructured":"Yew, P.C., Tzeng, N.F., Lawrie, D.: Distributing hot-spot addressing in large-scale multiprocessors. IEEE Trans. Comput. 36(4), 388\u2013395 (1987)","journal-title":"IEEE Trans. Comput."},{"key":"361_CR30","doi-asserted-by":"crossref","unstructured":"Zhang, E.Z., Jiang, Y., Shen, X.: Does cache sharing on modern CMP matter to the performance of contemporary multithreaded programs? In: Proceedings of the 15th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming PPoPP \u201910, pp. 203\u2013212. ACM (2010).","DOI":"10.1145\/1693453.1693482"},{"key":"361_CR31","unstructured":"Zhang, J., Behzad, B., Snir, M.: Optimizing the BarnesspsHut algorithm in UPC. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis SC \u201911, pp. 75:1\u201375:11. ACM (2011)."},{"key":"361_CR32","unstructured":"Zhu, H., Goodell, D., Gropp, W., Thakur, R.: Hierarchical collectives in MPICH2. In: Proceedings of the 16th European PVM\/MPI Users\u2019 Group Meeting on Recent Advances in Parallel Virtual Machine and Message Passing Interface, pp. 325\u2013326. Springer, Berlin (2009)."}],"container-title":["Cluster Computing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-014-0361-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10586-014-0361-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-014-0361-4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,8,8]],"date-time":"2019-08-08T11:40:02Z","timestamp":1565264402000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10586-014-0361-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014,3,19]]},"references-count":32,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2014,12]]}},"alternative-id":["361"],"URL":"https:\/\/doi.org\/10.1007\/s10586-014-0361-4","relation":{},"ISSN":["1386-7857","1573-7543"],"issn-type":[{"value":"1386-7857","type":"print"},{"value":"1573-7543","type":"electronic"}],"subject":[],"published":{"date-parts":[[2014,3,19]]}}}