Skip to content

Commit

Permalink
Merge pull request #9975 from bwbarrett/bugfix/rdma-osc
Browse files Browse the repository at this point in the history
osc/rdma: Handle remote completion behavior
  • Loading branch information
bwbarrett authored Feb 16, 2022
2 parents 44b1b8d + ad9fae9 commit d94e0bb
Show file tree
Hide file tree
Showing 14 changed files with 882 additions and 547 deletions.
4 changes: 4 additions & 0 deletions ompi/mca/osc/rdma/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
# Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2022 Amazon.com, Inc. or its affiliates.
# All Rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
Expand All @@ -21,6 +23,8 @@
rdma_sources = \
osc_rdma.h \
osc_rdma_module.c \
osc_rdma_btl_comm.h \
osc_rdma_btl_comm.c \
osc_rdma_comm.h \
osc_rdma_comm.c \
osc_rdma_accumulate.c \
Expand Down
87 changes: 55 additions & 32 deletions ompi/mca/osc/rdma/osc_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include "ompi/mca/osc/osc.h"
#include "ompi/mca/osc/base/base.h"
#include "opal/mca/btl/btl.h"
#include "opal/mca/btl/base/btl_base_am_rdma.h"
#include "ompi/memchecker.h"
#include "ompi/op/op.h"
#include "opal/align.h"
Expand All @@ -57,8 +58,6 @@

#define RANK_ARRAY_COUNT(module) ((ompi_comm_size ((module)->comm) + (module)->node_count - 1) / (module)->node_count)

#define MCA_OSC_RDMA_BTLS_SIZE_INIT 4

enum {
OMPI_OSC_RDMA_LOCKING_TWO_LEVEL,
OMPI_OSC_RDMA_LOCKING_ON_DEMAND,
Expand Down Expand Up @@ -150,9 +149,6 @@ struct ompi_osc_rdma_module_t {
/** value of same_size info key for this window */
bool same_size;

/** CPU atomics can be used */
bool use_cpu_atomics;

/** passive-target synchronization will not be used in this window */
bool no_locks;

Expand Down Expand Up @@ -260,18 +256,38 @@ struct ompi_osc_rdma_module_t {
/** lock for peer hash table/array */
opal_mutex_t peer_lock;

/* ******************* communication *********************** */

/* We currently support two modes of operation: a single
* accelerated btl (which can use memory registration and can use
* btl_flush()), or one or more alternate btls, which cannot use
* flush() or rely on memory registration. Since it is an
* either/or situation, we use a union to simplify the code.
*/
bool use_accelerated_btl;

union {
struct {
mca_btl_base_module_t *accelerated_btl;
};
struct {
mca_btl_base_am_rdma_module_t **alternate_am_rdmas;
uint8_t alternate_btl_count;
};
};

/** Does the selected BTL require memory registration? This field
will be false when alternate BTLs are used, and the value
when an accelerated BTL is used depends on the registration
requirements of the underlying BTL. */
bool use_memory_registration;

/** BTL(s) in use. Currently this is only used to support RDMA emulation over
* non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this
* could be used to support multiple RDMA-capable BTLs but the memory registration
* paths will need to be updated to pack/unpack multiple registration handles. */
struct mca_btl_base_module_t **selected_btls;
uint8_t selected_btls_size;
uint8_t btls_in_use;
size_t put_alignment;
size_t get_alignment;
size_t put_limit;
size_t get_limit;

/** Only true if one BTL is in use. Memory registration is only supported when
* using a single BTL. */
bool use_memory_registration;
uint32_t atomic_flags;

/** registered fragment used for locally buffered RDMA transfers */
struct ompi_osc_rdma_frag_t *rdma_frag;
Expand Down Expand Up @@ -383,10 +399,11 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc
size_t size, uint32_t flags, mca_btl_base_registration_handle_t **handle, int line, const char *file)
{
if (module->use_memory_registration) {
assert(module->use_accelerated_btl);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "registering segment with btl. range: %p - %p (%lu bytes)",
ptr, (void*)((char *) ptr + size), size);

*handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0], endpoint, ptr, size, flags);
*handle = module->accelerated_btl->btl_register_mem(module->accelerated_btl, endpoint, ptr, size, flags);
if (OPAL_UNLIKELY(NULL == *handle)) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to register pointer with selected BTL. base: %p, "
"size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line);
Expand All @@ -404,7 +421,9 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc
/**
 * Release a memory registration through the accelerated BTL.
 *
 * @param[in] module  osc rdma module that owns the registration
 * @param[in] handle  registration handle to release; NULL is a no-op
 * @param[in] line    caller's source line (not used by this function)
 * @param[in] file    caller's source file (not used by this function)
 */
static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file)
{
    if (NULL == handle) {
        return;
    }

    /* a handle is only expected when memory registration is in use, which
     * in turn requires the accelerated BTL; deregister exactly once */
    assert(module->use_memory_registration);
    assert(module->use_accelerated_btl);
    module->accelerated_btl->btl_deregister_mem(module->accelerated_btl, handle);
}

Expand Down Expand Up @@ -536,10 +555,11 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
/**
 * Report whether btl_flush() can be used for this module.
 *
 * @param[in] module  osc rdma module
 *
 * @return true only when the BTL interface version provides flush
 *         (BTL_VERSION >= 310), the module is using its accelerated BTL,
 *         and that BTL supplies a btl_flush function; false otherwise.
 */
static bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module)
{
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
    /* only consult the accelerated BTL when it is the active member */
    if (module->use_accelerated_btl) {
        return (NULL != module->accelerated_btl->btl_flush);
    }
#endif
    return false;
}

/**
Expand Down Expand Up @@ -601,13 +621,13 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
opal_progress ();
} while (ompi_osc_rdma_sync_get_count (sync));
#else
mca_btl_base_module_t *btl_module = sync->module->selected_btls[0];

do {
if (!ompi_osc_rdma_use_btl_flush (sync->module)) {
opal_progress ();
} else {
btl_module->btl_flush (btl_module, NULL);
assert(sync->module->use_accelerated_btl);
mca_btl_base_module_t *btl_module = sync->module->accelerated_btl;
btl_module->btl_flush(btl_module, NULL);
}
} while (ompi_osc_rdma_sync_get_count (sync) || (sync->module->rdma_frag && (sync->module->rdma_frag->pending > 1)));
#endif
Expand Down Expand Up @@ -637,17 +657,20 @@ static inline bool ompi_osc_rdma_oor (int rc)

/**
 * Look up the BTL module for a given BTL index.
 *
 * @param[in] module     osc rdma module
 * @param[in] btl_index  index of the BTL; must be 0 when the accelerated
 *                       BTL is in use, otherwise must be less than
 *                       alternate_btl_count
 *
 * @return the accelerated BTL, or the BTL referenced by the indexed
 *         alternate AM-RDMA module
 */
__opal_attribute_always_inline__
static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) {
    if (module->use_accelerated_btl) {
        /* there is exactly one accelerated BTL */
        assert(0 == btl_index);
        return module->accelerated_btl;
    }

    assert(btl_index < module->alternate_btl_count);
    return module->alternate_am_rdmas[btl_index]->btl;
}

__opal_attribute_always_inline__
static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, struct mca_btl_base_module_t *btl, uint8_t btl_index) {
if(btl_index == module->selected_btls_size) {
module->selected_btls_size *= 2;
module->selected_btls = realloc(module->selected_btls, module->selected_btls_size * sizeof(struct mca_btl_base_module_t *));
assert(NULL != module->selected_btls);
}
module->selected_btls[btl_index] = btl;

static inline mca_btl_base_am_rdma_module_t *ompi_osc_rdma_selected_am_rdma(ompi_osc_rdma_module_t *module, uint8_t btl_index) {
assert(!module->use_accelerated_btl);
assert(btl_index < module->alternate_btl_count);
return module->alternate_am_rdmas[btl_index];
}

#endif /* OMPI_OSC_RDMA_H */
64 changes: 31 additions & 33 deletions ompi/mca/osc/rdma/osc_rdma_accumulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,24 @@
* Copyright (c) 2019-2021 Google, LLC. All rights reserved.
* Copyright (c) 2021 IBM Corporation. All rights reserved.
* Copyright (c) 2022 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates.
* All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/

#include "ompi_config.h"

#include "osc_rdma_accumulate.h"
#include "osc_rdma_request.h"
#include "osc_rdma_comm.h"
#include "osc_rdma_lock.h"
#include "osc_rdma_btl_comm.h"

#include "opal/util/minmax.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/osc/base/osc_base_obj_convert.h"

Expand Down Expand Up @@ -157,13 +164,11 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int32_t atomic_flags = selected_btl->btl_atomic_flags;
int btl_op, flags;
int64_t origin;

if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) ||
(!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) ||
if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags) && 4 == extent)) ||
(!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & module->atomic_flags)) ||
!ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) {
return OMPI_ERR_NOT_SUPPORTED;
}
Expand Down Expand Up @@ -235,19 +240,11 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
ompi_op_t *op, ompi_osc_rdma_request_t *req)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int32_t atomic_flags = selected_btl->btl_atomic_flags;
int btl_op, flags;
int64_t origin;

if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
/* btl put atomics not supported or disabled. fall back on fetch-and-op */
return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle,
op, req);
}

if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) ||
(!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) ||
if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags) && 4 == extent)) ||
(!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & module->atomic_flags)) ||
!ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) {
return OMPI_ERR_NOT_SUPPORTED;
}
Expand Down Expand Up @@ -585,9 +582,9 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v

/* determine how much to put in this operation */
if (source_count) {
acc_len = min(min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit);
acc_len = opal_min(opal_min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit);
} else {
acc_len = min(target_iovec[target_iov_index].iov_len, acc_limit);
acc_len = opal_min(target_iovec[target_iov_index].iov_len, acc_limit);
}

if (0 != acc_len) {
Expand Down Expand Up @@ -662,13 +659,11 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
bool lock_acquired)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int32_t atomic_flags = btl->btl_atomic_flags;
const size_t size = datatype->super.size;
int64_t compare, source;
int flags, ret;

if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags))) {
if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags))) {
return OMPI_ERR_NOT_SUPPORTED;
}

Expand Down Expand Up @@ -716,7 +711,6 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
mca_btl_base_registration_handle_t *target_handle, bool lock_acquired)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
unsigned long len = datatype->super.size;
mca_btl_base_registration_handle_t *local_handle = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
Expand All @@ -741,26 +735,30 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
return OMPI_SUCCESS;
}

if (btl->btl_register_mem && len > btl->btl_put_local_registration_threshold) {
do {
ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr);
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
break;
}
if (module->use_memory_registration) {
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
if (len > btl->btl_put_local_registration_threshold) {
do {
ret = ompi_osc_rdma_frag_alloc(module, len, &frag, &ptr);
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
break;
}

ompi_osc_rdma_progress (module);
} while (1);
ompi_osc_rdma_progress (module);
} while (1);

memcpy (ptr, source_addr, len);
local_handle = frag->handle;
memcpy(ptr, source_addr, len);
local_handle = frag->handle;
}
}

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put...");

do {
ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address,
local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER,
ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
ret = ompi_osc_rdma_btl_put(module, peer->data_btl_index, peer->data_endpoint,
ptr, target_address, local_handle, target_handle,
len, 0, MCA_BTL_NO_ORDER,
ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) {
break;
}
Expand Down
27 changes: 0 additions & 27 deletions ompi/mca/osc/rdma/osc_rdma_active_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,33 +77,6 @@ OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_op_t, opal_list_item_t,
ompi_osc_rdma_pending_op_construct,
ompi_osc_rdma_pending_op_destruct);

/**
 * Completion callback for a pending atomic operation.
 *
 * The pending operation is passed in through @p context. In order, this
 * function: copies the operation buffer into the result location (if one
 * was given), invokes the operation's callback with the completion status
 * (if one was given), completes the attached fragment (if any), marks the
 * operation complete, and releases the pending-op object.
 *
 * @param[in] btl            not used by this function
 * @param[in] endpoint       not used by this function
 * @param[in] local_address  not used by this function
 * @param[in] local_handle   not used by this function
 * @param[in] context        the ompi_osc_rdma_pending_op_t being completed
 * @param[in] data           not used by this function
 * @param[in] status         completion status; logged and forwarded to the callback
 */
void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                    void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                    void *context, void *data, int status)
{
    ompi_osc_rdma_pending_op_t *op = context;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", (void *) op, status);

    /* hand the fetched value back to the caller's result location */
    if (NULL != op->op_result) {
        memmove (op->op_result, op->op_buffer, op->op_size);
    }

    /* notify whoever queued the operation */
    if (NULL != op->cbfunc) {
        op->cbfunc (op->cbdata, op->cbcontext, status);
    }

    /* the fragment backing the operation buffer is no longer needed */
    if (NULL != op->op_frag) {
        ompi_osc_rdma_frag_complete (op->op_frag);
        op->op_frag = NULL;
    }

    /* flag completion before releasing the object */
    op->op_complete = true;
    OBJ_RELEASE(op);
}

/**
* compare_ranks:
Expand Down
Loading

0 comments on commit d94e0bb

Please sign in to comment.