From 4bb33a2b29571706b646015163257bb67f08c9db Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 11 Jan 2022 00:06:11 +0000 Subject: [PATCH] osc/rdma: Use BTL am-rdma explicit interface Switch from using the implicit BTL interface (where the am-rdma interface just extends missing functionality in the BTL) to the new explicit interface (where the OSC RDMA interface is the only maintainer of the BTL list). With this change, alternate BTLs do not have to support REMOTE_COMPLETION to be selected (because the AM RDMA interface always provides remote completion when we request it, as this patch does). Any BTL that supports Active Messages (i.e., all of them) should be able to support the OSC RDMA required semantics, eliminating the problem of creating windows with no serviceable BTLs. Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/osc_rdma.h | 29 +++- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 40 +++--- ompi/mca/osc/rdma/osc_rdma_btl_comm.h | 77 ++++++++--- ompi/mca/osc/rdma/osc_rdma_comm.c | 174 ++++++++++++------------ ompi/mca/osc/rdma/osc_rdma_component.c | 63 +++++++-- ompi/mca/osc/rdma/osc_rdma_module.c | 5 +- 6 files changed, 240 insertions(+), 148 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index c70dacd4b0d..94b6cb641c8 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -44,6 +44,7 @@ #include "ompi/mca/osc/osc.h" #include "ompi/mca/osc/base/base.h" #include "opal/mca/btl/btl.h" +#include "opal/mca/btl/base/btl_base_am_rdma.h" #include "ompi/memchecker.h" #include "ompi/op/op.h" #include "opal/align.h" @@ -255,6 +256,8 @@ struct ompi_osc_rdma_module_t { /** lock for peer hash table/array */ opal_mutex_t peer_lock; + /* ******************* communication *********************** */ + /* we currently support two modes of operation, a single * accelerated btl (which can use memory registration and can use * btl_flush() and one or more alternate btls, which cannot use @@ -265,18 +268,27 @@ 
struct ompi_osc_rdma_module_t { union { struct { - struct mca_btl_base_module_t *accelerated_btl; + mca_btl_base_module_t *accelerated_btl; }; struct { - struct mca_btl_base_module_t **alternate_btls; + mca_btl_base_am_rdma_module_t **alternate_am_rdmas; uint8_t alternate_btl_count; }; }; - /** Only true if one BTL is in use. Memory registration is only supported when - * using a single BTL. */ + /** Does the selected BTL require memory registration? This field + will be false when alternate BTLs are used, and the value + when an accelerated BTL is used depends on the registration + requirements of the underlying BTL. */ bool use_memory_registration; + size_t put_alignment; + size_t get_alignment; + size_t put_limit; + size_t get_limit; + + uint32_t atomic_flags; + /** registered fragment used for locally buffered RDMA transfers */ struct ompi_osc_rdma_frag_t *rdma_frag; @@ -650,8 +662,15 @@ static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_m return module->accelerated_btl; } else { assert(btl_index < module->alternate_btl_count); - return module->alternate_btls[btl_index]; + return module->alternate_am_rdmas[btl_index]->btl; } } + +static inline mca_btl_base_am_rdma_module_t *ompi_osc_rdma_selected_am_rdma(ompi_osc_rdma_module_t *module, uint8_t btl_index) { + assert(!module->use_accelerated_btl); + assert(btl_index < module->alternate_btl_count); + return module->alternate_am_rdmas[btl_index]; +} + #endif /* OMPI_OSC_RDMA_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index 41d204ed059..0cec49a1d80 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -164,13 +164,11 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = 
ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - int32_t atomic_flags = selected_btl->btl_atomic_flags; int btl_op, flags; int64_t origin; - if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || - (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & module->atomic_flags)) || !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { return OMPI_ERR_NOT_SUPPORTED; } @@ -242,13 +240,11 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - int32_t atomic_flags = selected_btl->btl_atomic_flags; int btl_op, flags; int64_t origin; - if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || - (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & module->atomic_flags)) || !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { return OMPI_ERR_NOT_SUPPORTED; } @@ -663,13 +659,11 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - int32_t atomic_flags = btl->btl_atomic_flags; const size_t size = datatype->super.size; int64_t compare, source; int flags, ret; - if (8 != size && 
!(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags))) { + if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags))) { return OMPI_ERR_NOT_SUPPORTED; } @@ -717,7 +711,6 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, mca_btl_base_registration_handle_t *target_handle, bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); unsigned long len = datatype->super.size; mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -742,18 +735,21 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, return OMPI_SUCCESS; } - if (btl->btl_register_mem && len > btl->btl_put_local_registration_threshold) { - do { - ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { - break; - } + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + if (len > btl->btl_put_local_registration_threshold) { + do { + ret = ompi_osc_rdma_frag_alloc(module, len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { + break; + } - ompi_osc_rdma_progress (module); - } while (1); + ompi_osc_rdma_progress (module); + } while (1); - memcpy (ptr, source_addr, len); - local_handle = frag->handle; + memcpy(ptr, source_addr, len); + local_handle = frag->handle; + } } OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put..."); diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h index a666d77710b..718036d0a76 100644 --- a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h @@ -36,11 +36,17 @@ ompi_osc_rdma_btl_put(ompi_osc_rdma_module_t *module, uint8_t btl_index, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { 
- mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); - - return btl->btl_put(btl, endpoint, local_address, remote_address, - local_handle, remote_handle, size, flags, order, - cbfunc, cbcontext, cbdata); + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_put(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_put(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } } @@ -54,11 +60,18 @@ ompi_osc_rdma_btl_get(ompi_osc_rdma_module_t *module, uint8_t btl_index, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); - return btl->btl_get(btl, endpoint, local_address, remote_address, - local_handle, remote_handle, size, flags, order, - cbfunc, cbcontext, cbdata); + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_get(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_get(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } } @@ -71,6 +84,9 @@ ompi_osc_rdma_btl_atomic_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, { mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + /* the AM BTL interface does not currently support op calls */ + assert(module->use_accelerated_btl); + return 
btl->btl_atomic_op(btl, endpoint, remote_address, remote_handle, op, operand, flags, order, cbfunc, cbcontext, cbdata); @@ -87,12 +103,19 @@ ompi_osc_rdma_btl_atomic_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); - - return btl->btl_atomic_fop(btl, endpoint, local_address, remote_address, - local_handle, remote_handle, - op, operand, flags, order, - cbfunc, cbcontext, cbdata); + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_atomic_fop(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_atomic_fop(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); + } } @@ -105,12 +128,19 @@ ompi_osc_rdma_btl_atomic_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index uint64_t compare, uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); - - return btl->btl_atomic_cswap(btl, endpoint, local_address, remote_address, - local_handle, remote_handle, - compare, value, flags, order, - cbfunc, cbcontext, cbdata); + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_atomic_cswap(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, + compare, value, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return 
am_rdma->am_btl_atomic_cswap(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, + compare, value, flags, order, + cbfunc, cbcontext, cbdata); + } } @@ -195,7 +225,10 @@ ompi_osc_rdma_btl_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret; - if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { + /* if using the AM RDMA interface with alternate BTLs or if the + accelerated BTL does not support atomic ops, emulate the atomic + op over a fetch and atomic op */ + if (!module->use_accelerated_btl || !(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { return ompi_osc_rdma_btl_fop (module, btl_index, endpoint, address, address_handle, op, operand, flags, NULL, wait_for_completion, cbfunc, cbdata, cbcontext); } diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 17448892870..62f35d28d64 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -65,8 +65,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde struct mca_btl_base_endpoint_t *endpoint, uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *data, size_t len) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, btl_index); - const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); + const size_t btl_alignment_mask = ALIGNMENT_MASK(module->get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; volatile bool read_complete = false; @@ -82,25 +81,28 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde "), len: %lu (aligned: %lu)", (void *) endpoint, source_address, aligned_addr, (unsigned long) len, (unsigned long) aligned_len); - if (btl->btl_register_mem && len >= btl->btl_get_local_registration_threshold) { - do { - ret = 
ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret)) { - ompi_osc_rdma_progress (module); + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + if (len >= btl->btl_get_local_registration_threshold) { + do { + ret = ompi_osc_rdma_frag_alloc(module, aligned_len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret)) { + ompi_osc_rdma_progress(module); + } + } while (OMPI_ERR_OUT_OF_RESOURCE == ret); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "error allocating temporary buffer"); + return ret; } - } while (OMPI_ERR_OUT_OF_RESOURCE == ret); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "error allocating temporary buffer"); - return ret; + local_handle = frag->handle; + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocated temporary buffer %p in fragment %p", (void*)ptr, + (void *) frag); } - - local_handle = frag->handle; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocated temporary buffer %p in fragment %p", (void*)ptr, - (void *) frag); } - assert (!(source_address & ALIGNMENT_MASK(btl->btl_get_alignment))); + assert (!(source_address & ALIGNMENT_MASK(module->get_alignment))); do { ret = ompi_osc_rdma_btl_get(module, btl_index, endpoint, ptr, aligned_addr, @@ -487,7 +489,6 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t * ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *local_handle = NULL; mca_btl_base_rdma_completion_fn_t cbfunc = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -495,16 +496,19 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t * void *cbcontext; int ret; - if (btl->btl_register_mem && size > 
btl->btl_put_local_registration_threshold) { - ret = ompi_osc_rdma_frag_alloc (module, size, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ret = ompi_osc_rdma_register (module, peer->data_endpoint, source_buffer, size, 0, &local_handle); + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, peer->data_btl_index); + if (size > btl->btl_put_local_registration_threshold) { + ret = ompi_osc_rdma_frag_alloc(module, size, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; + ret = ompi_osc_rdma_register(module, peer->data_endpoint, source_buffer, size, 0, &local_handle); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } else { + memcpy(ptr, source_buffer, size); + local_handle = frag->handle; } - } else { - memcpy (ptr, source_buffer, size); - local_handle = frag->handle; } } @@ -606,8 +610,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); + const size_t btl_alignment_mask = ALIGNMENT_MASK(module->get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; osc_rdma_size_t aligned_len; @@ -623,70 +626,73 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating get of %lu bytes from remote ptr %" PRIx64 " to local ptr %p", size, source_address, target_buffer); - if ((btl->btl_register_mem && size > btl->btl_get_local_registration_threshold) || - (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - - ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - if (OMPI_ERR_VALUE_OUT_OF_BOUNDS 
== ret) { - /* region is too large for a buffered read */ - size_t subsize; + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, peer->data_btl_index); + if (size > btl->btl_get_local_registration_threshold || + (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) { - /* remote region has the same alignment but the base is not aligned. perform a small - * buffered get of the beginning of the remote region */ - aligned_source_base = OPAL_ALIGN(source_address, btl->btl_get_alignment, osc_rdma_base_t); - subsize = (size_t) (aligned_source_base - source_address); - - ret = ompi_osc_rdma_get_partial (sync, peer, source_address, source_handle, target_buffer, subsize, request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; + ret = ompi_osc_rdma_frag_alloc(module, aligned_len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { + /* region is too large for a buffered read */ + size_t subsize; + + if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) { + /* remote region has the same alignment but the base is not aligned. 
perform a small + * buffered get of the beginning of the remote region */ + aligned_source_base = OPAL_ALIGN(source_address, btl->btl_get_alignment, osc_rdma_base_t); + subsize = (size_t) (aligned_source_base - source_address); + + ret = ompi_osc_rdma_get_partial(sync, peer, source_address, source_handle, target_buffer, subsize, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + source_address += subsize; + target_buffer = (void *) ((intptr_t) target_buffer + subsize); + size -= subsize; + + aligned_len = aligned_source_bound - aligned_source_base; } - source_address += subsize; - target_buffer = (void *) ((intptr_t) target_buffer + subsize); - size -= subsize; + if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) && + (size & btl_alignment_mask)) { + /* remote region bases are aligned but the bounds are not. perform a + * small buffered get of the end of the remote region */ + aligned_len = size & ~btl_alignment_mask; + subsize = size - aligned_len; + size = aligned_len; + ret = ompi_osc_rdma_get_partial(sync, peer, source_address + aligned_len, source_handle, + (void *) ((intptr_t) target_buffer + aligned_len), subsize, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } + /* (remaining) user request is now correctly aligned */ + } - aligned_len = aligned_source_bound - aligned_source_base; + if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { + /* local and remote alignments differ */ + request->buffer = ptr = malloc(aligned_len); + } else { + ptr = target_buffer; } - if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) && - (size & btl_alignment_mask)) { - /* remote region bases are aligned but the bounds are not. 
perform a - * small buffered get of the end of the remote region */ - aligned_len = size & ~btl_alignment_mask; - subsize = size - aligned_len; - size = aligned_len; - ret = ompi_osc_rdma_get_partial (sync, peer, source_address + aligned_len, source_handle, - (void *) ((intptr_t) target_buffer + aligned_len), subsize, request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } + if (NULL != ptr) { + (void)ompi_osc_rdma_register(module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE, + &local_handle); } - /* (remaining) user request is now correctly aligned */ - } - if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - /* local and remote alignments differ */ - request->buffer = ptr = malloc (aligned_len); + if (OPAL_UNLIKELY(NULL == local_handle)) { + free(request->buffer); + request->buffer = NULL; + return ret; + } } else { - ptr = target_buffer; - } - - if (NULL != ptr) { - (void) ompi_osc_rdma_register (module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE, - &local_handle); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx", + (void*)ptr, (void *) frag, (unsigned long) aligned_len, (unsigned long) aligned_source_base); + local_handle = frag->handle; } - - if (OPAL_UNLIKELY(NULL == local_handle)) { - free (request->buffer); - request->buffer = NULL; - return ret; - } - } else { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx", - (void*)ptr, (void *) frag, (unsigned long) aligned_len, (unsigned long) aligned_source_base); - local_handle = frag->handle; } } @@ -742,7 +748,6 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = 
ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *target_handle; uint64_t target_address; int ret; @@ -777,7 +782,7 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi return ompi_osc_rdma_master (sync, (void *) origin_addr, origin_count, origin_datatype, peer, target_address, target_handle, target_count, target_datatype, request, - btl->btl_put_limit, ompi_osc_rdma_put_contig, false); + module->put_limit, ompi_osc_rdma_put_contig, false); } static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, @@ -785,7 +790,6 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *source_handle; uint64_t source_address; ptrdiff_t source_span, source_lb; @@ -818,7 +822,7 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori return ompi_osc_rdma_master (sync, origin_addr, origin_count, origin_datatype, peer, source_address, source_handle, source_count, source_datatype, request, - btl->btl_get_limit, ompi_osc_rdma_get_contig, true); + module->get_limit, ompi_osc_rdma_get_contig, true); } int ompi_osc_rdma_put (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, ptrdiff_t target_disp, int target_count, diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 8c7fc87d873..63030bafb1d 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -51,6 +51,7 @@ #include "opal/util/argv.h" #include "opal/util/printf.h" #include "opal/util/sys_limits.h" +#include "opal/util/minmax.h" #if OPAL_CUDA_SUPPORT 
#include "opal/mca/common/cuda/common_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ @@ -586,7 +587,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s use_cpu_atomics = !!(module->accelerated_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); } else { for (int i = 0 ; i < module->alternate_btl_count ; ++i) { - use_cpu_atomics &= !!(module->alternate_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + use_cpu_atomics &= !!(module->alternate_am_rdmas[i]->btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); } } } @@ -882,12 +883,14 @@ static void ompi_osc_rdma_ensure_local_add_procs (void) */ static int btl_latency_sort_fn(const void *a, const void *b) { - const struct mca_btl_base_module_t *btl_a = a; - const struct mca_btl_base_module_t *btl_b = b; + const mca_btl_base_am_rdma_module_t * const *am_rdma_a_p = a; + const mca_btl_base_am_rdma_module_t * const *am_rdma_b_p = b; + const mca_btl_base_am_rdma_module_t *am_rdma_a = *am_rdma_a_p; + const mca_btl_base_am_rdma_module_t *am_rdma_b = *am_rdma_b_p; - if (btl_a->btl_latency < btl_b->btl_latency) { + if (am_rdma_a->btl->btl_latency < am_rdma_b->btl->btl_latency) { return -1; - } else if (btl_a->btl_latency == btl_b->btl_latency) { + } else if (am_rdma_a->btl->btl_latency == am_rdma_b->btl->btl_latency) { return 0; } else { return 1; @@ -924,14 +927,19 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o mca_btl_base_selected_module_t *item; int ret; + module->put_alignment = 1; + module->get_alignment = 1; + module->put_limit = SIZE_MAX; + module->get_limit = SIZE_MAX; + btl_count = opal_list_get_size(&mca_btl_base_modules_initialized); if (btl_count > UINT8_MAX) { return OMPI_ERROR; } module->alternate_btl_count = btl_count; - module->alternate_btls = malloc(sizeof(struct mca_btl_base_module_t *) * btl_count); - if (NULL == module->alternate_btls) { + module->alternate_am_rdmas = malloc(sizeof(struct mca_btl_base_am_rdma_module_t *) * 
module->alternate_btl_count); + if (NULL == module->alternate_am_rdmas) { return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } @@ -951,19 +959,43 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, "found alternate btl %s", item->btl_module->btl_component->btl_version.mca_component_name); - ret = mca_btl_base_am_rdma_init(item->btl_module); + + ret = opal_btl_base_am_rdma_create(item->btl_module, + MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION, + true /* no_memory_registration */, + &(module->alternate_am_rdmas[index])); if (OMPI_SUCCESS != ret) { return ret; } - module->alternate_btls[index++] = item->btl_module; + + module->put_alignment = opal_max(module->put_alignment, + module->alternate_am_rdmas[index]->am_btl_put_alignment); + module->get_alignment = opal_max(module->get_alignment, + module->alternate_am_rdmas[index]->am_btl_get_alignment); + module->put_limit = opal_min(module->put_limit, + module->alternate_am_rdmas[index]->am_btl_put_limit); + module->get_limit = opal_min(module->get_limit, + module->alternate_am_rdmas[index]->am_btl_get_limit); + + index++; } - assert(index == btl_count); + assert(index == module->alternate_btl_count); /* sort based on latency, lowest first */ - qsort(module->alternate_btls, module->alternate_btl_count, - sizeof(struct mca_btl_base_module_t*), btl_latency_sort_fn); + qsort(module->alternate_am_rdmas, module->alternate_btl_count, + sizeof(module->alternate_am_rdmas[0]), btl_latency_sort_fn); module->use_memory_registration = false; + module->atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | + MCA_BTL_ATOMIC_SUPPORTS_AND | + MCA_BTL_ATOMIC_SUPPORTS_OR | + MCA_BTL_ATOMIC_SUPPORTS_XOR | + MCA_BTL_ATOMIC_SUPPORTS_SWAP | + MCA_BTL_ATOMIC_SUPPORTS_MIN | + MCA_BTL_ATOMIC_SUPPORTS_MAX | + MCA_BTL_ATOMIC_SUPPORTS_32BIT | + MCA_BTL_ATOMIC_SUPPORTS_CSWAP | + MCA_BTL_ATOMIC_SUPPORTS_GLOB; return OMPI_SUCCESS; } @@ -1126,7 +1158,12 @@ static int 
ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi btl_selection_complete: module->use_accelerated_btl = true; module->accelerated_btl = selected_btl; - module->use_memory_registration = selected_btl->btl_register_mem != NULL; + module->use_memory_registration = (selected_btl->btl_register_mem != NULL); + module->put_alignment = selected_btl->btl_put_alignment; + module->get_alignment = selected_btl->btl_get_alignment; + module->put_limit = selected_btl->btl_put_limit; + module->get_limit = selected_btl->btl_get_limit; + module->atomic_flags = selected_btl->btl_atomic_flags; opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, "accelerated_query: selected btl: %s", diff --git a/ompi/mca/osc/rdma/osc_rdma_module.c b/ompi/mca/osc/rdma/osc_rdma_module.c index 648b61414e6..8a080e1e4eb 100644 --- a/ompi/mca/osc/rdma/osc_rdma_module.c +++ b/ompi/mca/osc/rdma/osc_rdma_module.c @@ -145,7 +145,10 @@ int ompi_osc_rdma_free(ompi_win_t *win) mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module, module->free_after); if (!module->use_accelerated_btl) { - free(module->alternate_btls); + for (int i = 0 ; i < module->alternate_btl_count ; ++i) { + OBJ_RELEASE(module->alternate_am_rdmas[i]); + } + free(module->alternate_am_rdmas); } free (module);