diff --git a/packages/mpi/openmpi/openmpi-2.0.0 b/packages/mpi/openmpi/openmpi-2.0.0
index 194ead9..39ef618 100755
--- a/packages/mpi/openmpi/openmpi-2.0.0
+++ b/packages/mpi/openmpi/openmpi-2.0.0
@@ -44,6 +44,10 @@ if [[ -d /opt/voltaire/fca ]]; then
     CONFIGURE_OPTS+=" --with-fca=/opt/voltaire/fca"
 fi
 
+src_prepare() {
+    patch -p1 < $SCLASS_DIR/$sit_classfile-0001.patch
+}
+
 src_pretest() {
     make check
 }
diff --git a/packages/mpi/openmpi/openmpi-2.0.0-0001.patch b/packages/mpi/openmpi/openmpi-2.0.0-0001.patch
new file mode 100644
index 0000000..6bf2d95
--- /dev/null
+++ b/packages/mpi/openmpi/openmpi-2.0.0-0001.patch
@@ -0,0 +1,218 @@
+From 4079eec9749e47dddc6acc9c0847b3091601919f Mon Sep 17 00:00:00 2001
+From: Nathan Hjelm
+Date: Mon, 8 Aug 2016 11:33:45 -0600
+Subject: [PATCH] pml/ob1: be more selective when using rdma capable btls
+
+This commit updates the btl selection logic for the RDMA and RDMA
+pipeline protocols to use a btl iff: 1) the btl is also used for eager
+messages (high exclusivity), or 2) no other RDMA btl is available on
+an endpoint and the pml_ob1_use_all_rdma MCA variable is true. This
+fixes a performance regression with shared memory when an RDMA capable
+network is available.
+
+Signed-off-by: Nathan Hjelm
+---
+ ompi/mca/pml/ob1/pml_ob1.h           |  3 +-
+ ompi/mca/pml/ob1/pml_ob1_component.c |  6 +++
+ ompi/mca/pml/ob1/pml_ob1_rdma.c      | 79 ++++++++++++++++++++++++++++++++----
+ ompi/mca/pml/ob1/pml_ob1_rdma.h      |  6 +++
+ ompi/mca/pml/ob1/pml_ob1_recvreq.c   |  2 +-
+ 5 files changed, 85 insertions(+), 11 deletions(-)
+
+diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h
+index 1f78c51..8f0f510 100644
+--- a/ompi/mca/pml/ob1/pml_ob1.h
++++ b/ompi/mca/pml/ob1/pml_ob1.h
+@@ -12,7 +12,7 @@
+  *                         All rights reserved.
+  * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved
+  * Copyright (c) 2011      Sandia National Laboratories. All rights reserved.
+- * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
++ * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights
+  *                         reserved.
+  * Copyright (c) 2015      Research Organization for Information Science
+  *                         and Technology (RIST). All rights reserved.
+@@ -61,6 +61,7 @@ struct mca_pml_ob1_t {
+     int max_rdma_per_request;
+     int max_send_per_range;
+     bool leave_pinned;
++    bool use_all_rdma;
+     int leave_pinned_pipeline;
+ 
+     /* lock queue access */
+diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c
+index 5445f4b..e922c18 100644
+--- a/ompi/mca/pml/ob1/pml_ob1_component.c
++++ b/ompi/mca/pml/ob1/pml_ob1_component.c
+@@ -198,6 +198,12 @@ static int mca_pml_ob1_component_register(void)
+ 
+     mca_pml_ob1_param_register_uint("unexpected_limit", 128, &mca_pml_ob1.unexpected_limit);
+ 
++    mca_pml_ob1.use_all_rdma = false;
++    (void) mca_base_component_var_register(&mca_pml_ob1_component.pmlm_version, "use_all_rdma",
++                                           "Use all available RDMA btls for the RDMA and RDMA pipeline protocols "
++                                           "(default: false)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
++                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_GROUP, &mca_pml_ob1.use_all_rdma);
++
+     mca_pml_ob1.allocator_name = "bucket";
+     (void) mca_base_component_var_register(&mca_pml_ob1_component.pmlm_version, "allocator",
+                                            "Name of allocator component for unexpected messages",
+diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c
+index 888e126..7513b24 100644
+--- a/ompi/mca/pml/ob1/pml_ob1_rdma.c
++++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c
+@@ -10,7 +10,7 @@
+  *                         University of Stuttgart.  All rights reserved.
+  * Copyright (c) 2004-2005 The Regents of the University of California.
+  *                         All rights reserved.
+- * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
++ * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
+  *                         reserved.
+  * $COPYRIGHT$
+  *
+@@ -42,6 +42,7 @@ size_t mca_pml_ob1_rdma_btls(
+     mca_pml_ob1_com_btl_t* rdma_btls)
+ {
+     int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
++    int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
+     double weight_total = 0;
+     int num_btls_used = 0;
+ 
+@@ -57,6 +58,22 @@ size_t mca_pml_ob1_rdma_btls(
+                 (bml_endpoint->btl_rdma_index + n) % num_btls);
+         mca_btl_base_registration_handle_t *reg_handle = NULL;
+         mca_btl_base_module_t *btl = bml_btl->btl;
++        /* NTH: go ahead and use an rdma btl if is the only one */
++        bool ignore = !mca_pml_ob1.use_all_rdma;
++
++        /* do not use rdma btls that are not in the eager list. this is necessary to avoid using
++         * btls that exist on the endpoint only to support RMA. */
++        for (int i = 0 ; i < num_eager_btls && ignore ; ++i) {
++            mca_bml_base_btl_t *eager_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
++            if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
++                ignore = false;
++                break;
++            }
++        }
++
++        if (ignore) {
++            continue;
++        }
+ 
+         if (btl->btl_register_mem) {
+             /* do not use the RDMA protocol with this btl if 1) leave pinned is disabled,
+@@ -95,22 +112,66 @@ size_t mca_pml_ob1_rdma_btls(
+     return num_btls_used;
+ }
+ 
++size_t mca_pml_ob1_rdma_pipeline_btls_count (mca_bml_base_endpoint_t* bml_endpoint)
++{
++    int num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
++    int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
++    int rdma_count = 0;
++
++    for(int i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; ++i) {
++        mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
++        /* NTH: go ahead and use an rdma btl if is the only one */
++        bool ignore = !mca_pml_ob1.use_all_rdma;
++
++        for (int i = 0 ; i < num_eager_btls && ignore ; ++i) {
++            mca_bml_base_btl_t *eager_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
++            if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
++                ignore = false;
++                break;
++            }
++        }
++
++        if (!ignore) {
++            ++rdma_count;
++        }
++    }
++
++    return rdma_count;
++}
++
+ size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
+                                        size_t size,
+                                        mca_pml_ob1_com_btl_t* rdma_btls )
+ {
+-    int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
++    int num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
++    int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
+     double weight_total = 0;
++    int rdma_count = 0;
++
++    for(int i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
++        mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
++        /* NTH: go ahead and use an rdma btl if is the only one */
++        bool ignore = !mca_pml_ob1.use_all_rdma;
++
++        for (int i = 0 ; i < num_eager_btls && ignore ; ++i) {
++            mca_bml_base_btl_t *eager_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
++            if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
++                ignore = false;
++                break;
++            }
++        }
++
++        if (ignore) {
++            continue;
++        }
+ 
+-    for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
+-        rdma_btls[i].bml_btl =
+-            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
+-        rdma_btls[i].btl_reg = NULL;
++        rdma_btls[rdma_count].bml_btl = bml_btl;
++        rdma_btls[rdma_count++].btl_reg = NULL;
+ 
+-        weight_total += rdma_btls[i].bml_btl->btl_weight;
++        weight_total += bml_btl->btl_weight;
+     }
+ 
+-    mca_pml_ob1_calc_weighted_length(rdma_btls, i, size, weight_total);
++    mca_pml_ob1_calc_weighted_length (rdma_btls, rdma_count, size, weight_total);
+ 
+-    return i;
++    return rdma_count;
+ }
+diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.h b/ompi/mca/pml/ob1/pml_ob1_rdma.h
+index 80e4fb2..7729043 100644
+--- a/ompi/mca/pml/ob1/pml_ob1_rdma.h
++++ b/ompi/mca/pml/ob1/pml_ob1_rdma.h
+@@ -1,3 +1,4 @@
++/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+ /*
+  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+  *                         University Research and Technology
+@@ -9,6 +10,8 @@
+  *                         University of Stuttgart.  All rights reserved.
+  * Copyright (c) 2004-2005 The Regents of the University of California.
+  *                         All rights reserved.
++ * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
++ *                         reserved.
+  * $COPYRIGHT$
+  *
+  * Additional copyrights may follow
+@@ -37,5 +40,8 @@ size_t mca_pml_ob1_rdma_btls(struct mca_bml_base_endpoint_t* endpoint,
+  * bandwidth */
+ size_t mca_pml_ob1_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint,
+                                       size_t size, mca_pml_ob1_com_btl_t* rdma_btls);
++
++size_t mca_pml_ob1_rdma_pipeline_btls_count (mca_bml_base_endpoint_t* bml_endpoint);
++
+ #endif
+ 
+diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+index ef6d8a8..cfdb783 100644
+--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
++++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+@@ -263,7 +263,7 @@ static int mca_pml_ob1_recv_request_ack(
+     /* by default copy everything */
+     recvreq->req_send_offset = bytes_received;
+     if(hdr->hdr_msg_length > bytes_received) {
+-        size_t rdma_num = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
++        size_t rdma_num = mca_pml_ob1_rdma_pipeline_btls_count (bml_endpoint);
+         /*
+          * lookup request buffer to determine if memory is already
+          * registered.
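
Note: once this patch is applied, the selection behaviour described in the commit message above is controlled at run time by the pml_ob1_use_all_rdma MCA variable. A minimal usage sketch follows; the application binary (./my_app) and process count are placeholders, not part of this change:

    # Opt back into the pre-patch behaviour of using every RDMA-capable btl
    # for the RDMA and RDMA pipeline protocols (the new default is false).
    mpirun --mca pml ob1 --mca pml_ob1_use_all_rdma 1 -n 2 ./my_app

    # The variable is registered at MCA info level 5, so it can be inspected with:
    ompi_info --param pml ob1 --level 5 | grep use_all_rdma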