diff --git a/ompi/mca/osc/rdma/Makefile.am b/ompi/mca/osc/rdma/Makefile.am index e52d0087743..4757ce6aa93 100644 --- a/ompi/mca/osc/rdma/Makefile.am +++ b/ompi/mca/osc/rdma/Makefile.am @@ -11,6 +11,8 @@ # Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights # reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved. +# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. +# All Rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -21,6 +23,8 @@ rdma_sources = \ osc_rdma.h \ osc_rdma_module.c \ + osc_rdma_btl_comm.h \ + osc_rdma_btl_comm.c \ osc_rdma_comm.h \ osc_rdma_comm.c \ osc_rdma_accumulate.c \ diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 2a8aeae156d..94b6cb641c8 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -44,6 +44,7 @@ #include "ompi/mca/osc/osc.h" #include "ompi/mca/osc/base/base.h" #include "opal/mca/btl/btl.h" +#include "opal/mca/btl/base/btl_base_am_rdma.h" #include "ompi/memchecker.h" #include "ompi/op/op.h" #include "opal/align.h" @@ -57,8 +58,6 @@ #define RANK_ARRAY_COUNT(module) ((ompi_comm_size ((module)->comm) + (module)->node_count - 1) / (module)->node_count) -#define MCA_OSC_RDMA_BTLS_SIZE_INIT 4 - enum { OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, OMPI_OSC_RDMA_LOCKING_ON_DEMAND, @@ -150,9 +149,6 @@ struct ompi_osc_rdma_module_t { /** value of same_size info key for this window */ bool same_size; - /** CPU atomics can be used */ - bool use_cpu_atomics; - /** passive-target synchronization will not be used in this window */ bool no_locks; @@ -260,18 +256,38 @@ struct ompi_osc_rdma_module_t { /** lock for peer hash table/array */ opal_mutex_t peer_lock; + /* ******************* communication *********************** */ + + /* we currently support two modes of operation, a single + * accelerated btl (which can use memory registration and can use + * btl_flush()) and one or more alternate btls, which cannot use + * flush() or rely on memory 
registration. Since it is an + * either/or situation, we use a union to simplify the code. + */ + bool use_accelerated_btl; + + union { + struct { + mca_btl_base_module_t *accelerated_btl; + }; + struct { + mca_btl_base_am_rdma_module_t **alternate_am_rdmas; + uint8_t alternate_btl_count; + }; + }; + + /** Does the selected BTL require memory registration? This field + will be false when alternate BTLs are used, and the value + when an accelerated BTL is used depends on the registration + requirements of the underlying BTL. */ + bool use_memory_registration; - /** BTL(s) in use. Currently this is only used to support RDMA emulation over - * non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this - * could be used to support multiple RDMA-capable BTLs but the memory registration - * paths will need to be updated to pack/unpack multiple registration handles. */ - struct mca_btl_base_module_t **selected_btls; - uint8_t selected_btls_size; - uint8_t btls_in_use; + size_t put_alignment; + size_t get_alignment; + size_t put_limit; + size_t get_limit; - /** Only true if one BTL is in use. Memory registration is only supported when - * using a single BTL. */ - bool use_memory_registration; + uint32_t atomic_flags; /** registered fragment used for locally buffered RDMA transfers */ struct ompi_osc_rdma_frag_t *rdma_frag; @@ -383,10 +399,11 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc size_t size, uint32_t flags, mca_btl_base_registration_handle_t **handle, int line, const char *file) { if (module->use_memory_registration) { + assert(module->use_accelerated_btl); OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "registering segment with btl. 
range: %p - %p (%lu bytes)", ptr, (void*)((char *) ptr + size), size); - *handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0], endpoint, ptr, size, flags); + *handle = module->accelerated_btl->btl_register_mem(module->accelerated_btl, endpoint, ptr, size, flags); if (OPAL_UNLIKELY(NULL == *handle)) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to register pointer with selected BTL. base: %p, " "size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line); @@ -404,7 +421,9 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file) { if (handle) { - module->selected_btls[0]->btl_deregister_mem (module->selected_btls[0], handle); + assert(module->use_memory_registration); + assert(module->use_accelerated_btl); + module->accelerated_btl->btl_deregister_mem(module->accelerated_btl, handle); } } @@ -536,10 +555,11 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r static bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module) { #if defined(BTL_VERSION) && (BTL_VERSION >= 310) - return !!(module->selected_btls[0]->btl_flush); -#else - return false; + if (module->use_accelerated_btl) { + return (NULL != module->accelerated_btl->btl_flush); + } #endif + return false; } /** @@ -601,13 +621,13 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync) opal_progress (); } while (ompi_osc_rdma_sync_get_count (sync)); #else - mca_btl_base_module_t *btl_module = sync->module->selected_btls[0]; - do { if (!ompi_osc_rdma_use_btl_flush (sync->module)) { opal_progress (); } else { - btl_module->btl_flush (btl_module, NULL); + assert(sync->module->use_accelerated_btl); + mca_btl_base_module_t *btl_module = sync->module->accelerated_btl; + btl_module->btl_flush(btl_module, NULL); } } while 
(ompi_osc_rdma_sync_get_count (sync) || (sync->module->rdma_frag && (sync->module->rdma_frag->pending > 1))); #endif @@ -637,17 +657,20 @@ static inline bool ompi_osc_rdma_oor (int rc) __opal_attribute_always_inline__ static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) { - return module->selected_btls[btl_index]; + if (module->use_accelerated_btl) { + assert(0 == btl_index); + return module->accelerated_btl; + } else { + assert(btl_index < module->alternate_btl_count); + return module->alternate_am_rdmas[btl_index]->btl; + } } -__opal_attribute_always_inline__ -static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, struct mca_btl_base_module_t *btl, uint8_t btl_index) { - if(btl_index == module->selected_btls_size) { - module->selected_btls_size *= 2; - module->selected_btls = realloc(module->selected_btls, module->selected_btls_size * sizeof(struct mca_btl_base_module_t *)); - assert(NULL != module->selected_btls); - } - module->selected_btls[btl_index] = btl; + +static inline mca_btl_base_am_rdma_module_t *ompi_osc_rdma_selected_am_rdma(ompi_osc_rdma_module_t *module, uint8_t btl_index) { + assert(!module->use_accelerated_btl); + assert(btl_index < module->alternate_btl_count); + return module->alternate_am_rdmas[btl_index]; } #endif /* OMPI_OSC_RDMA_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index ab0b21e539a..0cec49a1d80 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -10,6 +10,8 @@ * Copyright (c) 2019-2021 Google, LLC. All rights reserved. * Copyright (c) 2021 IBM Corporation. All rights reserved. * Copyright (c) 2022 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -17,10 +19,15 @@ * $HEADER$ */ +#include "ompi_config.h" + #include "osc_rdma_accumulate.h" #include "osc_rdma_request.h" #include "osc_rdma_comm.h" +#include "osc_rdma_lock.h" +#include "osc_rdma_btl_comm.h" +#include "opal/util/minmax.h" #include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" @@ -157,13 +164,11 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - int32_t atomic_flags = selected_btl->btl_atomic_flags; int btl_op, flags; int64_t origin; - if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || - (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & module->atomic_flags)) || !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { return OMPI_ERR_NOT_SUPPORTED; } @@ -235,19 +240,11 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - int32_t atomic_flags = selected_btl->btl_atomic_flags; int btl_op, flags; int64_t origin; - if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { - /* btl put atomics not supported or disabled. 
fall back on fetch-and-op */ - return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle, - op, req); - } - - if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || - (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & module->atomic_flags)) || !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { return OMPI_ERR_NOT_SUPPORTED; } @@ -585,9 +582,9 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v /* determine how much to put in this operation */ if (source_count) { - acc_len = min(min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit); + acc_len = opal_min(opal_min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit); } else { - acc_len = min(target_iovec[target_iov_index].iov_len, acc_limit); + acc_len = opal_min(target_iovec[target_iov_index].iov_len, acc_limit); } if (0 != acc_len) { @@ -662,13 +659,11 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - int32_t atomic_flags = btl->btl_atomic_flags; const size_t size = datatype->super.size; int64_t compare, source; int flags, ret; - if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags))) { + if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags))) { return OMPI_ERR_NOT_SUPPORTED; } @@ -716,7 +711,6 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, 
mca_btl_base_registration_handle_t *target_handle, bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); unsigned long len = datatype->super.size; mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -741,26 +735,30 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, return OMPI_SUCCESS; } - if (btl->btl_register_mem && len > btl->btl_put_local_registration_threshold) { - do { - ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { - break; - } + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + if (len > btl->btl_put_local_registration_threshold) { + do { + ret = ompi_osc_rdma_frag_alloc(module, len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { + break; + } - ompi_osc_rdma_progress (module); - } while (1); + ompi_osc_rdma_progress (module); + } while (1); - memcpy (ptr, source_addr, len); - local_handle = frag->handle; + memcpy(ptr, source_addr, len); + local_handle = frag->handle; + } } OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put..."); do { - ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address, - local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL); + ret = ompi_osc_rdma_btl_put(module, peer->data_btl_index, peer->data_endpoint, + ptr, target_address, local_handle, target_handle, + len, 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL); if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) { break; } diff --git a/ompi/mca/osc/rdma/osc_rdma_active_target.c b/ompi/mca/osc/rdma/osc_rdma_active_target.c index fdd3dd5c832..3706a098f77 100644 --- 
a/ompi/mca/osc/rdma/osc_rdma_active_target.c +++ b/ompi/mca/osc/rdma/osc_rdma_active_target.c @@ -77,33 +77,6 @@ OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_op_t, opal_list_item_t, ompi_osc_rdma_pending_op_construct, ompi_osc_rdma_pending_op_destruct); -/** - * Dummy completion function for atomic operations - */ -void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - void *context, void *data, int status) -{ - ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context; - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", (void*)pending_op, status); - - if (pending_op->op_result) { - memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size); - } - - if (NULL != pending_op->cbfunc) { - pending_op->cbfunc (pending_op->cbdata, pending_op->cbcontext, status); - } - - if (NULL != pending_op->op_frag) { - ompi_osc_rdma_frag_complete (pending_op->op_frag); - pending_op->op_frag = NULL; - } - - pending_op->op_complete = true; - OBJ_RELEASE(pending_op); -} /** * compare_ranks: diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_comm.c b/ompi/mca/osc/rdma/osc_rdma_btl_comm.c new file mode 100644 index 00000000000..cbc8a761593 --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_btl_comm.c @@ -0,0 +1,61 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights + * reserved. 
+ * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2017-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2021 Google, LLC. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "osc_rdma.h" +#include "osc_rdma_frag.h" +#include "osc_rdma_btl_comm.h" + +#include "opal/mca/btl/base/base.h" + +void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *data, int status) +{ + ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", (void*)pending_op, status); + + if (pending_op->op_result) { + memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size); + } + + if (NULL != pending_op->cbfunc) { + pending_op->cbfunc (pending_op->cbdata, pending_op->cbcontext, status); + } + + if (NULL != pending_op->op_frag) { + ompi_osc_rdma_frag_complete (pending_op->op_frag); + pending_op->op_frag = NULL; + } + + pending_op->op_complete = true; + OBJ_RELEASE(pending_op); +} diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h new file mode 100644 index 00000000000..718036d0a76 --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2019 Triad National Security, LLC. 
All rights + * reserved. + * Copyright (c) 2021 Google, LLC. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSC_RDMA_BTL_COMM_H +#define OSC_RDMA_BTL_COMM_H + +#include "osc_rdma_frag.h" + +#include "opal/mca/btl/btl.h" + + +void ompi_osc_rdma_atomic_complete(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *data, int status); + + +static inline int +ompi_osc_rdma_btl_put(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_put(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_put(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } +} + + +static inline int +ompi_osc_rdma_btl_get(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + + if 
(module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_get(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_get(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } +} + + +static inline int +ompi_osc_rdma_btl_atomic_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + + /* the AM BTL interface does not currently support op calls */ + assert(module->use_accelerated_btl); + + return btl->btl_atomic_op(btl, endpoint, remote_address, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); +} + + +static inline int +ompi_osc_rdma_btl_atomic_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) + +{ + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_atomic_fop(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); + } else { + 
mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_atomic_fop(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); + } +} + + +static inline int +ompi_osc_rdma_btl_atomic_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + uint64_t compare, uint64_t value, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_atomic_cswap(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, + compare, value, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_atomic_cswap(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, + compare, value, flags, order, + cbfunc, cbcontext, cbdata); + } +} + + +static inline int +ompi_osc_rdma_btl_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, uint64_t address, + mca_btl_base_registration_handle_t *address_handle, int op, + int64_t operand, int flags, int64_t *result, const bool wait_for_completion, + ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) +{ + ompi_osc_rdma_pending_op_t *pending_op; + mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); + int ret = OPAL_ERROR; + + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + + if (!wait_for_completion) { + /* NTH: need to keep track of pending ops 
to avoid a potential teardown problem */ + pending_op->module = module; + (void) opal_atomic_fetch_add_32 (&module->pending_ops, 1); + } + + pending_op->op_result = (void *) result; + pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; + OBJ_RETAIN(pending_op); + if (cbfunc) { + pending_op->cbfunc = cbfunc; + pending_op->cbdata = cbdata; + pending_op->cbcontext = cbcontext; + } + + /* spin until the btl has accepted the operation */ + do { + if (NULL == pending_op->op_frag) { + ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); + } + + if (NULL != pending_op->op_frag) { + ret = ompi_osc_rdma_btl_atomic_fop(module, btl_index, endpoint, pending_op->op_buffer, + (intptr_t) address, pending_op->op_frag->handle, address_handle, + op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + (void *) pending_op, NULL); + } + + if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { + break; + } + ompi_osc_rdma_progress (module); + } while (1); + + if (OPAL_SUCCESS != ret) { + if (OPAL_LIKELY(1 == ret)) { + /* no store through result here: ompi_osc_rdma_atomic_complete() copies op_size bytes into op_result when it is non-NULL (result is NULL when called from ompi_osc_rdma_btl_op()) */ + ret = OMPI_SUCCESS; + ompi_osc_rdma_atomic_complete (selected_btl, endpoint, pending_op->op_buffer, + pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS); + } else { + /* need to release here because ompi_osc_rdma_atomic_complete was not called */ + OBJ_RELEASE(pending_op); + } + } else if (wait_for_completion) { + while (!pending_op->op_complete) { + ompi_osc_rdma_progress (module); + } + } + + OBJ_RELEASE(pending_op); + + return ret; +} + + +static inline int +ompi_osc_rdma_btl_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, uint64_t address, + mca_btl_base_registration_handle_t *address_handle, + int op, int64_t operand, int flags, const bool wait_for_completion, + ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) +{ + ompi_osc_rdma_pending_op_t *pending_op; + 
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); + int ret; + + /* if using the AM RDMA interface with alternate BTLs or if the + accelerated BTL does not support atomic ops, emulate the atomic + op over a fetch and atomic op */ + if (!module->use_accelerated_btl || !(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { + return ompi_osc_rdma_btl_fop (module, btl_index, endpoint, address, address_handle, op, operand, flags, + NULL, wait_for_completion, cbfunc, cbdata, cbcontext); + } + + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + OBJ_RETAIN(pending_op); + if (cbfunc) { + pending_op->cbfunc = cbfunc; + pending_op->cbdata = cbdata; + pending_op->cbcontext = cbcontext; + } + + if (!wait_for_completion) { + /* NTH: need to keep track of pending ops to avoid a potential teardown problem */ + pending_op->module = module; + (void) opal_atomic_fetch_add_32 (&module->pending_ops, 1); + } + + /* spin until the btl has accepted the operation */ + do { + ret = ompi_osc_rdma_btl_atomic_op(module, btl_index, endpoint, (intptr_t) address, address_handle, + op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + (void *) pending_op, NULL); + + if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { + break; + } + ompi_osc_rdma_progress (module); + } while (1); + + if (OPAL_SUCCESS != ret) { + /* need to release here because ompi_osc_rdma_atomic_complete was not called */ + OBJ_RELEASE(pending_op); + if (OPAL_LIKELY(1 == ret)) { + if (cbfunc) { + cbfunc (cbdata, cbcontext, OMPI_SUCCESS); + } + ret = OMPI_SUCCESS; + } + } else if (wait_for_completion) { + while (!pending_op->op_complete) { + ompi_osc_rdma_progress (module); + } + } + + OBJ_RELEASE(pending_op); + + return ret; +} + + +static inline int +ompi_osc_rdma_btl_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, uint64_t address, + mca_btl_base_registration_handle_t *address_handle, + int64_t 
compare, int64_t value, int flags, int64_t *result) +{ + ompi_osc_rdma_pending_op_t *pending_op; + int ret; + + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + + OBJ_RETAIN(pending_op); + + pending_op->op_result = (void *) result; + pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; + + /* spin until the btl has accepted the operation */ + do { + if (NULL == pending_op->op_frag) { + ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); + } + if (NULL != pending_op->op_frag) { + ret = ompi_osc_rdma_btl_atomic_cswap(module, btl_index, endpoint, pending_op->op_buffer, + address, pending_op->op_frag->handle, address_handle, compare, + value, flags, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, + NULL); + } + + if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { + break; + } + ompi_osc_rdma_progress (module); + } while (1); + + if (OPAL_SUCCESS != ret) { + if (OPAL_LIKELY(1 == ret)) { + *result = ((int64_t *) pending_op->op_buffer)[0]; + ret = OMPI_SUCCESS; + } + + /* need to release here because ompi_osc_rdma_atomic_complete was not called */ + OBJ_RELEASE(pending_op); + } else { + while (!pending_op->op_complete) { + ompi_osc_rdma_progress (module); + } + } + + OBJ_RELEASE(pending_op); + + return ret; +} + +#endif /* OSC_RDMA_BTL_COMM_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 449bbea0641..569cf669cae 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -6,6 +6,8 @@ * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -13,13 +15,18 @@ * $HEADER$ */ +#include "ompi_config.h" + #include "osc_rdma_comm.h" +#include "osc_rdma_frag.h" #include "osc_rdma_sync.h" #include "osc_rdma_request.h" #include "osc_rdma_dynamic.h" +#include "osc_rdma_btl_comm.h" -#include "ompi/mca/osc/base/osc_base_obj_convert.h" #include "opal/align.h" +#include "opal/util/minmax.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" /* helper functions */ static inline void ompi_osc_rdma_cleanup_rdma (ompi_osc_rdma_sync_t *sync, bool dec_always, ompi_osc_rdma_frag_t *frag, @@ -58,8 +65,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde struct mca_btl_base_endpoint_t *endpoint, uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *data, size_t len) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, btl_index); - const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); + const size_t btl_alignment_mask = ALIGNMENT_MASK(module->get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; volatile bool read_complete = false; @@ -75,30 +81,33 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde "), len: %lu (aligned: %lu)", (void *) endpoint, source_address, aligned_addr, (unsigned long) len, (unsigned long) aligned_len); - if (btl->btl_register_mem && len >= btl->btl_get_local_registration_threshold) { - do { - ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret)) { - ompi_osc_rdma_progress (module); + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + if (len >= btl->btl_get_local_registration_threshold) { + do { + ret = ompi_osc_rdma_frag_alloc(module, aligned_len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret)) { + 
ompi_osc_rdma_progress(module); + } + } while (OMPI_ERR_OUT_OF_RESOURCE == ret); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "error allocating temporary buffer"); + return ret; } - } while (OMPI_ERR_OUT_OF_RESOURCE == ret); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "error allocating temporary buffer"); - return ret; + local_handle = frag->handle; + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocated temporary buffer %p in fragment %p", (void*)ptr, + (void *) frag); } - - local_handle = frag->handle; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocated temporary buffer %p in fragment %p", (void*)ptr, - (void *) frag); } - assert (!(source_address & ALIGNMENT_MASK(btl->btl_get_alignment))); + assert (!(source_address & ALIGNMENT_MASK(module->get_alignment))); do { - ret = btl->btl_get (btl, endpoint, ptr, aligned_addr, - local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, - ompi_osc_get_data_complete, (void *) &read_complete, NULL); + ret = ompi_osc_rdma_btl_get(module, btl_index, endpoint, ptr, aligned_addr, + local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, + ompi_osc_get_data_complete, (void *) &read_complete, NULL); if (!ompi_osc_rdma_oor (ret)) { break; } @@ -242,7 +251,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc assert (0 != local_iov_count); /* determine how much to transfer in this operation */ - rdma_len = min(min(local_iovec[local_iov_index].iov_len, remote_iovec[remote_iov_index].iov_len), max_rdma_len); + rdma_len = opal_min(opal_min(local_iovec[local_iov_index].iov_len, remote_iovec[remote_iov_index].iov_len), max_rdma_len); /* execute the get */ if (!subreq && alloc_reqs) { @@ -389,7 +398,7 @@ static void ompi_osc_rdma_put_complete (struct mca_btl_base_module_t *btl, struc /* the lowest bit is used as a flag indicating this put operation has a request */ if ((intptr_t) context & 0x1) { - 
ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); + ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); sync = request->sync; if (0 == OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1)) { @@ -420,7 +429,7 @@ static void ompi_osc_rdma_put_complete_flush (struct mca_btl_base_module_t *btl, /* the lowest bit is used as a flag indicating this put operation has a request */ if ((intptr_t) context & 0x1) { - ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); + ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); module = request->module; if (0 == OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1)) { @@ -444,7 +453,6 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee mca_btl_base_registration_handle_t *local_handle, size_t size, mca_btl_base_rdma_completion_fn_t cb, void *context, void *cbdata) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating btl put of %lu bytes to remote address %" PRIx64 ", sync " @@ -454,9 +462,9 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee ompi_osc_rdma_sync_rdma_inc (sync); do { - ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address, - local_handle, target_handle, size, 0, MCA_BTL_NO_ORDER, - cb, context, cbdata); + ret = ompi_osc_rdma_btl_put(module, peer->data_btl_index, peer->data_endpoint, + ptr, target_address, local_handle, target_handle, + size, 0, MCA_BTL_NO_ORDER, cb, context, cbdata); if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { return OMPI_SUCCESS; } @@ -481,7 +489,6 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t * ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = 
sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *local_handle = NULL; mca_btl_base_rdma_completion_fn_t cbfunc = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -489,16 +496,19 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t * void *cbcontext; int ret; - if (btl->btl_register_mem && size > btl->btl_put_local_registration_threshold) { - ret = ompi_osc_rdma_frag_alloc (module, size, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ret = ompi_osc_rdma_register (module, peer->data_endpoint, source_buffer, size, 0, &local_handle); + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, peer->data_btl_index); + if (size > btl->btl_put_local_registration_threshold) { + ret = ompi_osc_rdma_frag_alloc(module, size, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; + ret = ompi_osc_rdma_register(module, peer->data_endpoint, source_buffer, size, 0, &local_handle); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } else { + memcpy(ptr, source_buffer, size); + local_handle = frag->handle; } - } else { - memcpy (ptr, source_buffer, size); - local_handle = frag->handle; } } @@ -600,8 +610,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); + const size_t btl_alignment_mask = ALIGNMENT_MASK(module->get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; osc_rdma_size_t aligned_len; @@ -617,70 +626,73 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating get 
of %lu bytes from remote ptr %" PRIx64 " to local ptr %p", size, source_address, target_buffer); - if ((btl->btl_register_mem && size > btl->btl_get_local_registration_threshold) || - (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, peer->data_btl_index); + if (size > btl->btl_get_local_registration_threshold || + (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { - /* region is too large for a buffered read */ - size_t subsize; - - if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) { - /* remote region has the same alignment but the base is not aligned. perform a small - * buffered get of the beginning of the remote region */ - aligned_source_base = OPAL_ALIGN(source_address, btl->btl_get_alignment, osc_rdma_base_t); - subsize = (size_t) (aligned_source_base - source_address); - - ret = ompi_osc_rdma_get_partial (sync, peer, source_address, source_handle, target_buffer, subsize, request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; + ret = ompi_osc_rdma_frag_alloc(module, aligned_len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { + /* region is too large for a buffered read */ + size_t subsize; + + if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) { + /* remote region has the same alignment but the base is not aligned. 
perform a small + * buffered get of the beginning of the remote region */ + aligned_source_base = OPAL_ALIGN(source_address, btl->btl_get_alignment, osc_rdma_base_t); + subsize = (size_t) (aligned_source_base - source_address); + + ret = ompi_osc_rdma_get_partial(sync, peer, source_address, source_handle, target_buffer, subsize, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + source_address += subsize; + target_buffer = (void *) ((intptr_t) target_buffer + subsize); + size -= subsize; + + aligned_len = aligned_source_bound - aligned_source_base; } - source_address += subsize; - target_buffer = (void *) ((intptr_t) target_buffer + subsize); - size -= subsize; + if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) && + (size & btl_alignment_mask)) { + /* remote region bases are aligned but the bounds are not. perform a + * small buffered get of the end of the remote region */ + aligned_len = size & ~btl_alignment_mask; + subsize = size - aligned_len; + size = aligned_len; + ret = ompi_osc_rdma_get_partial(sync, peer, source_address + aligned_len, source_handle, + (void *) ((intptr_t) target_buffer + aligned_len), subsize, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } + /* (remaining) user request is now correctly aligned */ + } - aligned_len = aligned_source_bound - aligned_source_base; + if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { + /* local and remote alignments differ */ + request->buffer = ptr = malloc(aligned_len); + } else { + ptr = target_buffer; } - if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) && - (size & btl_alignment_mask)) { - /* remote region bases are aligned but the bounds are not. 
perform a - * small buffered get of the end of the remote region */ - aligned_len = size & ~btl_alignment_mask; - subsize = size - aligned_len; - size = aligned_len; - ret = ompi_osc_rdma_get_partial (sync, peer, source_address + aligned_len, source_handle, - (void *) ((intptr_t) target_buffer + aligned_len), subsize, request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } + if (NULL != ptr) { + (void)ompi_osc_rdma_register(module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE, + &local_handle); } - /* (remaining) user request is now correctly aligned */ - } - if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - /* local and remote alignments differ */ - request->buffer = ptr = malloc (aligned_len); + if (OPAL_UNLIKELY(NULL == local_handle)) { + free(request->buffer); + request->buffer = NULL; + return ret; + } } else { - ptr = target_buffer; - } - - if (NULL != ptr) { - (void) ompi_osc_rdma_register (module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE, - &local_handle); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx", + (void*)ptr, (void *) frag, (unsigned long) aligned_len, (unsigned long) aligned_source_base); + local_handle = frag->handle; } - - if (OPAL_UNLIKELY(NULL == local_handle)) { - free (request->buffer); - request->buffer = NULL; - return ret; - } - } else { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx", - (void*)ptr, (void *) frag, (unsigned long) aligned_len, (unsigned long) aligned_source_base); - local_handle = frag->handle; } } @@ -703,10 +715,10 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p } do { - ret = btl->btl_get (btl, peer->data_endpoint, ptr, - aligned_source_base, local_handle, source_handle, - aligned_len, 0, MCA_BTL_NO_ORDER, 
ompi_osc_rdma_get_complete, - request, frag); + ret = ompi_osc_rdma_btl_get(module, peer->data_btl_index, peer->data_endpoint, + ptr, aligned_source_base, local_handle, source_handle, + aligned_len, 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_get_complete, request, frag); if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { return OMPI_SUCCESS; } @@ -736,7 +748,6 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *target_handle; uint64_t target_address; int ret; @@ -771,7 +782,7 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi return ompi_osc_rdma_master (sync, (void *) origin_addr, origin_count, origin_datatype, peer, target_address, target_handle, target_count, target_datatype, request, - btl->btl_put_limit, ompi_osc_rdma_put_contig, false); + module->put_limit, ompi_osc_rdma_put_contig, false); } static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, @@ -779,7 +790,6 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *source_handle; uint64_t source_address; ptrdiff_t source_span, source_lb; @@ -812,7 +822,7 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori return ompi_osc_rdma_master (sync, origin_addr, origin_count, origin_datatype, peer, source_address, source_handle, source_count, source_datatype, request, - btl->btl_get_limit, ompi_osc_rdma_get_contig, true); + module->get_limit, 
ompi_osc_rdma_get_contig, true); } int ompi_osc_rdma_put (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, ptrdiff_t target_disp, int target_count, diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.h b/ompi/mca/osc/rdma/osc_rdma_comm.h index efb305a571e..82c1e873263 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_comm.h @@ -4,6 +4,8 @@ * reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -17,11 +19,9 @@ #include "osc_rdma_dynamic.h" #include "osc_rdma_request.h" #include "osc_rdma_sync.h" -#include "osc_rdma_lock.h" #define OMPI_OSC_RDMA_DECODE_MAX 64 -#define min(a,b) ((a) < (b) ? (a) : (b)) #define ALIGNMENT_MASK(x) ((x) ? (x) - 1 : 0) /** diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index cdc71ad3056..5b3a6df2bc8 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -35,6 +35,7 @@ #include "ompi_config.h" #include +#include #include "osc_rdma.h" #include "osc_rdma_frag.h" @@ -50,6 +51,7 @@ #include "opal/util/argv.h" #include "opal/util/printf.h" #include "opal/util/sys_limits.h" +#include "opal/util/minmax.h" #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ @@ -84,7 +86,6 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, const char *value); static char *ompi_osc_rdma_full_connectivity_btls; -static char *ompi_osc_rdma_btl_alternate_names; static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = { {.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"}, @@ -230,7 +231,7 @@ static int 
ompi_osc_rdma_component_register (void) MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach); free(description_str); - mca_osc_rdma_component.priority = 101; + mca_osc_rdma_component.priority = 20; opal_asprintf(&description_str, "Priority of the osc/rdma component (default: %d)", mca_osc_rdma_component.priority); (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "priority", description_str, @@ -257,14 +258,6 @@ static int ompi_osc_rdma_component_register (void) MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_full_connectivity_btls); free(description_str); - ompi_osc_rdma_btl_alternate_names = "sm,tcp"; - opal_asprintf(&description_str, "Comma-delimited list of alternate BTL component names to allow without verifying " - "connectivity (default: %s)", ompi_osc_rdma_btl_alternate_names); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "alternate_btls", description_str, - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_alternate_names); - free(description_str); - if (0 == access ("/dev/shm", W_OK)) { mca_osc_rdma_component.backing_directory = "/dev/shm"; } else { @@ -395,15 +388,14 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s } #endif /* OPAL_CUDA_SUPPORT */ - if (OMPI_SUCCESS == ompi_osc_rdma_query_accelerated_btls (comm, NULL)) { - return mca_osc_rdma_component.priority; - } - - if (OMPI_SUCCESS == ompi_osc_rdma_query_alternate_btls (comm, NULL)) { - return mca_osc_rdma_component.priority; + /* verify if we have any btls available. Since we do not verify + * connectivity across all btls in the alternate case, this is as + * good a test as we are going to have for success. 
*/ + if (opal_list_is_empty(&mca_btl_base_modules_initialized)) { + return -1; } - return -1; + return OMPI_SUCCESS;; } static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void **base, size_t size) { @@ -419,6 +411,7 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void region->len = size; if (module->use_memory_registration && size) { + assert(module->use_accelerated_btl); if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor || NULL == module->state_handle) { ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, *base, size, MCA_BTL_REG_FLAG_ACCESS_ANY, &module->base_handle); @@ -426,23 +419,24 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void return OMPI_ERR_OUT_OF_RESOURCE; } - memcpy (region->btl_handle_data, module->base_handle, module->selected_btls[0]->btl_registration_handle_size); + memcpy (region->btl_handle_data, module->base_handle, module->accelerated_btl->btl_registration_handle_size); } else { - memcpy (region->btl_handle_data, module->state_handle, module->selected_btls[0]->btl_registration_handle_size); + memcpy (region->btl_handle_data, module->state_handle, module->accelerated_btl->btl_registration_handle_size); } } return OMPI_SUCCESS; } -static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size) +static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size, bool use_cpu_atomics) { size_t total_size, local_rank_array_size, leader_peer_data_size, base_data_size; ompi_osc_rdma_peer_t *my_peer; int ret, my_rank; size_t memory_alignment = module->memory_alignment; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating private internal state"); + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "allocating private internal state"); my_rank = ompi_comm_rank (module->comm); @@ -514,7 +508,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void 
**base, s my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; my_peer->state = (uint64_t) (uintptr_t) module->state; - if (module->use_cpu_atomics) { + if (use_cpu_atomics) { /* all peers are local or it is safe to mix cpu and nic atomics */ my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE; } else { @@ -533,7 +527,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s ex_peer->size = size; } - if (!module->use_cpu_atomics) { + if (!use_cpu_atomics) { if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { /* base is local and cpu atomics are available */ ex_peer->super.base_handle = module->state_handle; @@ -577,6 +571,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s struct _local_data *temp; char *data_file; size_t memory_alignment = module->memory_alignment; + bool use_cpu_atomics; shared_comm = module->shared_comm; @@ -585,20 +580,30 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s /* CPU atomics can be used if every process is on the same node or the NIC allows mixing CPU and NIC atomics */ module->single_node = local_size == global_size; - module->use_cpu_atomics = module->single_node; - if (!module->single_node) { - for (int i = 0 ; i < module->btls_in_use ; ++i) { - module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); - } + if (module->single_node) { + use_cpu_atomics = true; + } else if (module->use_accelerated_btl) { + use_cpu_atomics = !!(module->accelerated_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + } else { + /* using the shared state optimization that is enabled by + * being able to use cpu atomics was never enabled for + * alternate btls, due to a previous bug in the enablement + * logic when alternate btls were first supported. It is + * likely that this optimization could work with sufficient + * testing, but for now, always disable to not introduce new + * correctness risks. 
+ */ + use_cpu_atomics = false; } if (1 == local_size) { /* no point using a shared segment if there are no other processes on this node */ - return allocate_state_single (module, base, size); + return allocate_state_single (module, base, size, use_cpu_atomics); } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating shared internal state"); + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "allocating shared internal state"); local_rank_array_size = sizeof (ompi_osc_rdma_rank_data_t) * RANK_ARRAY_COUNT (module); leader_peer_data_size = module->region_size * module->node_count; @@ -654,7 +659,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ret = opal_shmem_segment_create (&module->seg_ds, data_file, total_size); free (data_file); if (OPAL_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create shared memory segment"); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to create shared memory segment"); } } } @@ -672,7 +678,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s module->segment_base = opal_shmem_segment_attach (&module->seg_ds); if (NULL == module->segment_base) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to attach to the shared memory segment"); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to attach to the shared memory segment"); ret = OPAL_ERROR; } @@ -708,14 +715,16 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s if (0 == local_rank) { /* unlink the shared memory backing file */ opal_shmem_unlink (&module->seg_ds); - /* just go ahead and register the whole segment */ - ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, - MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle); - if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { - 
state_region->base = (intptr_t) module->segment_base; - if (module->state_handle) { - memcpy (state_region->btl_handle_data, module->state_handle, - module->selected_btls[0]->btl_registration_handle_size); + if (module->use_accelerated_btl) { + /* just go ahead and register the whole segment */ + ret = ompi_osc_rdma_register(module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, + MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle); + if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { + state_region->base = (intptr_t) module->segment_base; + if (module->state_handle) { + memcpy(state_region->btl_handle_data, module->state_handle, + module->accelerated_btl->btl_registration_handle_size); + } } } } @@ -735,8 +744,9 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s region->base = state_region->base + my_base_offset; region->len = size; if (module->use_memory_registration) { - memcpy (region->btl_handle_data, state_region->btl_handle_data, - module->selected_btls[0]->btl_registration_handle_size); + assert(module->use_accelerated_btl); + memcpy(region->btl_handle_data, state_region->btl_handle_data, + module->accelerated_btl->btl_registration_handle_size); } } @@ -768,7 +778,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ex_peer = (ompi_osc_rdma_peer_extended_t *) peer; /* set up peer state */ - if (module->use_cpu_atomics) { + if (use_cpu_atomics) { /* all peers are local or it is safe to mix cpu and nic atomics */ peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE; peer->state = (osc_rdma_counter_t) peer_state; @@ -793,7 +803,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s } if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor && MPI_WIN_FLAVOR_CREATE != module->flavor && - !module->use_cpu_atomics && temp[i].size && i > 0) { + !use_cpu_atomics && temp[i].size && i > 0) { /* use the local leader's endpoint */ peer->data_endpoint = local_leader->data_endpoint; 
peer->data_btl_index = local_leader->data_btl_index; @@ -802,7 +812,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ompi_osc_module_add_peer (module, peer); if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) { - if (module->use_cpu_atomics && peer_rank == my_rank) { + if (use_cpu_atomics && peer_rank == my_rank) { peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; } /* nothing more to do */ @@ -818,7 +828,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ex_peer->size = temp[i].size; } - if (module->use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) { + if (use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) { /* base is local and cpu atomics are available */ if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { ex_peer->super.base = (uintptr_t) module->segment_base + offset; @@ -871,72 +881,133 @@ static void ompi_osc_rdma_ensure_local_add_procs (void) free(procs); } + +/* + * qsort() sorting function for ompi_osc_rdma_query_alternate_btls(), + * using latency as the sorting metric. 
+ */ +static int btl_latency_sort_fn(const void *a, const void *b) +{ + const mca_btl_base_am_rdma_module_t * const *am_rdma_a_p = a; + const mca_btl_base_am_rdma_module_t * const *am_rdma_b_p = b; + const mca_btl_base_am_rdma_module_t *am_rdma_a = *am_rdma_a_p; + const mca_btl_base_am_rdma_module_t *am_rdma_b = *am_rdma_b_p; + + if (am_rdma_a->btl->btl_latency < am_rdma_b->btl->btl_latency) { + return -1; + } else if (am_rdma_a->btl->btl_latency == am_rdma_b->btl->btl_latency) { + return 0; + } else { + return 1; + } +} + + /** * @brief query for alternate BTLs * * @in comm Communicator to query - * @out module OSC module to store BTLs/count to (optional) - * @out + * @inout module OSC module to store BTLs/count to (must not be NULL) * * @return OMPI_SUCCESS if BTLs can be found * @return OMPI_ERR_UNREACH if no BTLs can be found that match * - * In this case an "alternate" BTL is a BTL does not meet the - * requirements of a BTL outlined in ompi_osc_rdma_query_accelerated_btls(). - * Either it does not provide connectivity to all peers, provide - * remote completion, or natively support put/get/atomic.. Since more - * than one BTL may be needed for this support the OSC component will - * disable the use of registration-based RDMA (these BTLs will not be - * used) and will use any remaining BTL. By default the BTLs used will - * be tcp and sm but any single (or pair) of BTLs may be used. + * We directly use the active message rdma wrappers for alternate + * BTLs, in all cases. This greatly simplifies the alternate BTL + * implementation, at the expense of some performance. With the + * AM wrappers, we can always enforce remote completion and the lack + * of memory registration, at some performance cost. But we can use + * as many BTLs as we like. The module's btl list is sorted by + * latency, so that ompi_osc_rdma_peer_btl_endpoint() picks the lowest + * available latency btl to communicate with the peer. Unlike the OB1 + * PML, we only use one BTL per peer. 
+ * + * Like the OB1 PML, there is no verification that there is at least + * one BTL that can communicate with every other peer in the window. */ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) { + size_t btl_count; + size_t index = 0; mca_btl_base_selected_module_t *item; - char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); - int btls_found = 0; - - btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); - if (NULL == btls_to_use) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names); - return OMPI_ERR_UNREACH; - } - - if (module) { - module->btls_in_use = 0; - } - - /* rdma and atomics are only supported with BTLs at the moment */ - for (int i = 0 ; btls_to_use[i] ; ++i) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking for btl %s", btls_to_use[i]); - OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { - if (NULL != item->btl_module->btl_register_mem) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping RDMA btl when searching for alternate BTL"); - continue; - } - - if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping btl %s", - item->btl_module->btl_component->btl_version.mca_component_name); - continue; - } - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "found alternate btl %s", btls_to_use[i]); + int ret; - ++btls_found; - if (module) { - mca_btl_base_am_rdma_init(item->btl_module); - ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); - } - + assert(NULL != module); + + module->put_alignment = 1; + module->get_alignment = 1; + module->put_limit = SIZE_MAX; + module->get_limit = SIZE_MAX; + + btl_count = opal_list_get_size(&mca_btl_base_modules_initialized); + if (btl_count > UINT8_MAX) { + return OMPI_ERROR; + } + + 
module->alternate_btl_count = btl_count; + module->alternate_am_rdmas = malloc(sizeof(struct mca_btl_base_am_rdma_module_t *) * module->alternate_btl_count); + if (NULL == module->alternate_am_rdmas) { + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + + /* add all alternate btls to the alternate_am_rdmas array, not worrying + about ordering yet. We have to add all btls unless we want to + iterate over all endpoints to build the minimum set of btls + needed to communicate with all peers. An MCA parameter just + for osc rdma also wouldn't work, as the BML can decide not to + add an endpoint for a btl given the priority of another btl. + For example, it is not uncommon that the only endpoint created + to a peer on the same host is the sm btl's endpoint. If we + had an osc rdma specific parameter list, and the user + specified a combination not including sm, that would result in + an eventual failure, as no btl would be found to talk to ranks + on the same host. */ + OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "found alternate btl %s", + item->btl_module->btl_component->btl_version.mca_component_name); + + ret = opal_btl_base_am_rdma_create(item->btl_module, + MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION, + true /* no_memory_registration */, + &(module->alternate_am_rdmas[index])); + if (OMPI_SUCCESS != ret) { + return ret; } - } - opal_argv_free (btls_to_use); + module->put_alignment = opal_max(module->put_alignment, + module->alternate_am_rdmas[index]->am_btl_put_alignment); + module->get_alignment = opal_max(module->get_alignment, + module->alternate_am_rdmas[index]->am_btl_get_alignment); + module->put_limit = opal_min(module->put_limit, + module->alternate_am_rdmas[index]->am_btl_put_limit); + module->get_limit = opal_min(module->get_limit, + module->alternate_am_rdmas[index]->am_btl_get_limit); + + index++; + } + assert(index == 
module->alternate_btl_count); + + /* sort based on latency, lowest first */ + qsort(module->alternate_am_rdmas, module->alternate_btl_count, + sizeof(module->alternate_am_rdmas[0]), btl_latency_sort_fn); + + module->use_memory_registration = false; + module->atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | + MCA_BTL_ATOMIC_SUPPORTS_AND | + MCA_BTL_ATOMIC_SUPPORTS_OR | + MCA_BTL_ATOMIC_SUPPORTS_XOR | + MCA_BTL_ATOMIC_SUPPORTS_SWAP | + MCA_BTL_ATOMIC_SUPPORTS_MIN | + MCA_BTL_ATOMIC_SUPPORTS_MAX | + MCA_BTL_ATOMIC_SUPPORTS_32BIT | + MCA_BTL_ATOMIC_SUPPORTS_CSWAP | + MCA_BTL_ATOMIC_SUPPORTS_GLOB; - return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; + return OMPI_SUCCESS; } + /* Check for BTL requirements: * 1) RDMA (put/get) and ATOMIC operations. We only require cswap * and fetch and add and will emulate other opterations with those @@ -967,9 +1038,6 @@ static bool ompi_osc_rdma_check_accelerated_btl(struct mca_btl_base_module_t *bt * Testing (1) is expensive, so as an optimization, the * ompi_osc_rdma_full_connectivity_btls list contains the list of BTL * components we know can achieve (1) in almost all usage scenarios. - * - * If module is NULL, the code acts as a query mechanism to find any - * potential BTLs, and is used to implement osc_rdma_query(). */ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) { @@ -978,11 +1046,10 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi mca_bml_base_endpoint_t *base_endpoint; char **btls_to_use; - if (module) { - ompi_osc_rdma_selected_btl_insert(module, NULL, 0); - module->btls_in_use = 0; - module->use_memory_registration = false; - } + assert(NULL != module); + + module->use_accelerated_btl = false; + module->use_memory_registration = false; /* Check for BTLs in the list of BTLs we know can reach all peers in general usage. 
*/ @@ -1089,19 +1156,24 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi } if (NULL == selected_btl) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no suitable btls found"); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "accelerated_query: no suitable btls found"); return OMPI_ERR_NOT_AVAILABLE; } btl_selection_complete: - if (module) { - ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0); - module->btls_in_use = 1; - module->use_memory_registration = selected_btl->btl_register_mem != NULL; - } - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "selected btl: %s", - selected_btl->btl_component->btl_version.mca_component_name); + module->use_accelerated_btl = true; + module->accelerated_btl = selected_btl; + module->use_memory_registration = (selected_btl->btl_register_mem != NULL); + module->put_alignment = selected_btl->btl_put_alignment; + module->get_alignment = selected_btl->btl_get_alignment; + module->put_limit = selected_btl->btl_put_limit; + module->get_limit = selected_btl->btl_get_limit; + module->atomic_flags = selected_btl->btl_atomic_flags; + + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "accelerated_query: selected btl: %s", + selected_btl->btl_component->btl_version.mca_component_name); return OMPI_SUCCESS; } @@ -1141,7 +1213,8 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module) my_data->len = (osc_rdma_size_t) my_rank; if (module->use_memory_registration && module->state_handle) { - memcpy (my_data->btl_handle_data, module->state_handle, module->selected_btls[0]->btl_registration_handle_size); + assert(module->use_accelerated_btl); + memcpy (my_data->btl_handle_data, module->state_handle, module->accelerated_btl->btl_registration_handle_size); } /* gather state data at each node leader */ @@ -1150,7 +1223,8 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module) module->region_size, MPI_BYTE, 
module->local_leaders, module->local_leaders->c_coll->coll_allgather_module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "leader allgather failed with ompi error code %d", ret); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "leader allgather failed with ompi error code %d", ret); break; } } @@ -1193,7 +1267,8 @@ static int ompi_osc_rdma_create_groups (ompi_osc_rdma_module_t *module) /* create a shared communicator to handle communication about the local segment */ ret = ompi_comm_split_type (module->comm, MPI_COMM_TYPE_SHARED, 0, NULL, &module->shared_comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create a shared memory communicator. error code %d", ret); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to create a shared memory communicator. error code %d", ret); return ret; } @@ -1204,7 +1279,8 @@ static int ompi_osc_rdma_create_groups (ompi_osc_rdma_module_t *module) ret = ompi_comm_split (module->comm, (0 == local_rank) ? 0 : MPI_UNDEFINED, comm_rank, &module->local_leaders, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create local leaders communicator. error code %d", ret); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to create local leaders communicator. error code %d", ret); return ret; } @@ -1217,7 +1293,8 @@ static int ompi_osc_rdma_create_groups (ompi_osc_rdma_module_t *module) ret = module->shared_comm->c_coll->coll_bcast (values, 2, MPI_INT, 0, module->shared_comm, module->shared_comm->c_coll->coll_bcast_module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to broadcast local data. error code %d", ret); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to broadcast local data. 
error code %d", ret); return ret; } } @@ -1311,9 +1388,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, module->acc_use_amo = mca_osc_rdma_component.acc_use_amo; module->network_amo_max_count = mca_osc_rdma_component.network_amo_max_count; - module->selected_btls_size = MCA_OSC_RDMA_BTLS_SIZE_INIT; - module->selected_btls = calloc(module->selected_btls_size, sizeof(struct mca_btl_base_module_t *)); - module->all_sync.module = module; module->flavor = flavor; @@ -1350,8 +1424,9 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, return ret; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "creating osc/rdma window of flavor %d with id %s", - flavor, ompi_comm_print_cid (module->comm)); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "creating osc/rdma window of flavor %d with id %s", + flavor, ompi_comm_print_cid (module->comm)); /* peer data */ if (world_size > init_limit) { @@ -1370,13 +1445,16 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, } /* find rdma capable endpoints */ + module->use_accelerated_btl = false; ret = ompi_osc_rdma_query_accelerated_btls (module->comm, module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_WARN, "could not find a suitable btl. falling back on " - "active-message BTLs"); + opal_output_verbose(MCA_BASE_VERBOSE_WARN, ompi_osc_base_framework.framework_output, + "could not find an accelerated btl. 
falling back on " + "active-message BTLs"); ret = ompi_osc_rdma_query_alternate_btls (module->comm, module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_WARN, "no BTL available for RMA window"); + opal_output_verbose(MCA_BASE_VERBOSE_WARN, ompi_osc_base_framework.framework_output, + "no BTL available for RMA window"); ompi_osc_rdma_free (win); return ret; } @@ -1386,7 +1464,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, module->region_size = sizeof (ompi_osc_rdma_region_t); if (module->use_memory_registration) { - module->region_size += module->selected_btls[0]->btl_registration_handle_size; + assert(module->use_accelerated_btl); + module->region_size += module->accelerated_btl->btl_registration_handle_size; } module->state_size = sizeof (ompi_osc_rdma_state_t); @@ -1428,7 +1507,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, /* notify all others if something went wrong */ ret = synchronize_errorcode(ret, module->comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to allocate internal state"); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to allocate internal state"); ompi_osc_rdma_free (win); return ret; } @@ -1479,14 +1559,16 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, ret = ompi_osc_rdma_share_data (module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to share window data with peers"); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to share window data with peers"); ompi_osc_rdma_free (win); } else { /* for now the leader is always rank 0 in the communicator */ module->leader = ompi_osc_rdma_module_peer (module, 0); - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "finished creating osc/rdma window with id %s", - ompi_comm_print_cid(module->comm)); + 
opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "finished creating osc/rdma window with id %s", + ompi_comm_print_cid(module->comm)); } return ret; diff --git a/ompi/mca/osc/rdma/osc_rdma_dynamic.c b/ompi/mca/osc/rdma/osc_rdma_dynamic.c index 8adfa7f8159..61e14fea56c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_dynamic.c +++ b/ompi/mca/osc/rdma/osc_rdma_dynamic.c @@ -252,7 +252,8 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len) return OMPI_ERR_RMA_ATTACH; } - memcpy (region->btl_handle_data, handle, module->selected_btls[0]->btl_registration_handle_size); + assert(module->use_accelerated_btl); + memcpy(region->btl_handle_data, handle, module->accelerated_btl->btl_registration_handle_size); rdma_region_handle->btl_handle = handle; } else { rdma_region_handle->btl_handle = NULL; diff --git a/ompi/mca/osc/rdma/osc_rdma_lock.h b/ompi/mca/osc/rdma/osc_rdma_lock.h index 36a30a1cc0b..19a3249bdbe 100644 --- a/ompi/mca/osc/rdma/osc_rdma_lock.h +++ b/ompi/mca/osc/rdma/osc_rdma_lock.h @@ -5,6 +5,8 @@ * Copyright (c) 2019 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2021 Google, LLC. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -17,6 +19,7 @@ #include "osc_rdma_types.h" #include "osc_rdma_frag.h" +#include "osc_rdma_btl_comm.h" static inline int ompi_osc_rdma_trylock_local (ompi_osc_rdma_atomic_lock_t *lock) { @@ -29,82 +32,6 @@ static inline void ompi_osc_rdma_unlock_local (ompi_osc_rdma_atomic_lock_t *lock (void) ompi_osc_rdma_lock_add (lock, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE); } -/** - * Dummy completion function for atomic operations - */ -void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - void *context, void *data, int status); - -__opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t btl_index, - struct mca_btl_base_endpoint_t *endpoint, uint64_t address, - mca_btl_base_registration_handle_t *address_handle, int op, - int64_t operand, int flags, int64_t *result, const bool wait_for_completion, - ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) -{ - ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); - int ret = OPAL_ERROR; - - pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); - assert (NULL != pending_op); - - if (!wait_for_completion) { - /* NTH: need to keep track of pending ops to avoid a potential teardown problem */ - pending_op->module = module; - (void) opal_atomic_fetch_add_32 (&module->pending_ops, 1); - } - - pending_op->op_result = (void *) result; - pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 
4 : 8; - OBJ_RETAIN(pending_op); - if (cbfunc) { - pending_op->cbfunc = cbfunc; - pending_op->cbdata = cbdata; - pending_op->cbcontext = cbcontext; - } - - /* spin until the btl has accepted the operation */ - do { - if (NULL == pending_op->op_frag) { - ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); - } - - if (NULL != pending_op->op_frag) { - ret = selected_btl->btl_atomic_fop (selected_btl, endpoint, pending_op->op_buffer, - (intptr_t) address, pending_op->op_frag->handle, address_handle, - op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) pending_op, NULL); - } - - if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { - break; - } - ompi_osc_rdma_progress (module); - } while (1); - - if (OPAL_SUCCESS != ret) { - if (OPAL_LIKELY(1 == ret)) { - *result = ((int64_t *) pending_op->op_buffer)[0]; - ret = OMPI_SUCCESS; - ompi_osc_rdma_atomic_complete (selected_btl, endpoint, pending_op->op_buffer, - pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS); - } else { - /* need to release here because ompi_osc_rdma_atomic_complete was not called */ - OBJ_RELEASE(pending_op); - } - } else if (wait_for_completion) { - while (!pending_op->op_complete) { - ompi_osc_rdma_progress (module); - } - } - - OBJ_RELEASE(pending_op); - - return ret; -} - __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result, @@ -114,69 +41,6 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om operand, 0, result, wait_for_completion, NULL, NULL, NULL); } -__opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, uint8_t btl_index, - struct mca_btl_base_endpoint_t *endpoint, uint64_t address, - mca_btl_base_registration_handle_t *address_handle, - int op, int64_t 
operand, int flags, const bool wait_for_completion, - ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) -{ - ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); - int ret; - - if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { - return ompi_osc_rdma_btl_fop (module, btl_index, endpoint, address, address_handle, op, operand, flags, - NULL, wait_for_completion, cbfunc, cbdata, cbcontext); - } - - pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); - assert (NULL != pending_op); - OBJ_RETAIN(pending_op); - if (cbfunc) { - pending_op->cbfunc = cbfunc; - pending_op->cbdata = cbdata; - pending_op->cbcontext = cbcontext; - } - - if (!wait_for_completion) { - /* NTH: need to keep track of pending ops to avoid a potential teardown problem */ - pending_op->module = module; - (void) opal_atomic_fetch_add_32 (&module->pending_ops, 1); - } - - /* spin until the btl has accepted the operation */ - do { - ret = selected_btl->btl_atomic_op (selected_btl, endpoint, (intptr_t) address, address_handle, - op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) pending_op, NULL); - - if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { - break; - } - ompi_osc_rdma_progress (module); - } while (1); - - if (OPAL_SUCCESS != ret) { - /* need to release here because ompi_osc_rdma_atomic_complete was not called */ - OBJ_RELEASE(pending_op); - if (OPAL_LIKELY(1 == ret)) { - if (cbfunc) { - cbfunc (cbdata, cbcontext, OMPI_SUCCESS); - } - ret = OMPI_SUCCESS; - } - } else if (wait_for_completion) { - while (!pending_op->op_complete) { - ompi_osc_rdma_progress (module); - } - } - - OBJ_RELEASE(pending_op); - - return ret; -} - __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, int op, ompi_osc_rdma_lock_t operand, const bool wait_for_completion) @@ -185,61 +49,6 @@ static 
inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, omp operand, 0, wait_for_completion, NULL, NULL, NULL); } -__opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, uint8_t btl_index, - struct mca_btl_base_endpoint_t *endpoint, uint64_t address, - mca_btl_base_registration_handle_t *address_handle, - int64_t compare, int64_t value, int flags, int64_t *result) -{ - ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); - int ret; - - pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); - assert (NULL != pending_op); - - OBJ_RETAIN(pending_op); - - pending_op->op_result = (void *) result; - pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; - - /* spin until the btl has accepted the operation */ - do { - if (NULL == pending_op->op_frag) { - ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); - } - if (NULL != pending_op->op_frag) { - ret = selected_btl->btl_atomic_cswap (selected_btl, endpoint, pending_op->op_buffer, - address, pending_op->op_frag->handle, address_handle, compare, - value, flags, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, - NULL); - } - - if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { - break; - } - ompi_osc_rdma_progress (module); - } while (1); - - if (OPAL_SUCCESS != ret) { - if (OPAL_LIKELY(1 == ret)) { - *result = ((int64_t *) pending_op->op_buffer)[0]; - ret = OMPI_SUCCESS; - } - - /* need to release here because ompi_osc_rdma_atomic_complete was not called */ - OBJ_RELEASE(pending_op); - } else { - while (!pending_op->op_complete) { - ompi_osc_rdma_progress (module); - } - } - - OBJ_RELEASE(pending_op); - - return ret; -} - __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_cswap (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, ompi_osc_rdma_lock_t compare, ompi_osc_rdma_lock_t 
value, ompi_osc_rdma_lock_t *result) diff --git a/ompi/mca/osc/rdma/osc_rdma_module.c b/ompi/mca/osc/rdma/osc_rdma_module.c index 933baf00694..8a080e1e4eb 100644 --- a/ompi/mca/osc/rdma/osc_rdma_module.c +++ b/ompi/mca/osc/rdma/osc_rdma_module.c @@ -144,7 +144,12 @@ int ompi_osc_rdma_free(ompi_win_t *win) free (module->outstanding_lock_array); mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module, module->free_after); - free (module->selected_btls); + if (!module->use_accelerated_btl) { + for (int i = 0 ; i < module->alternate_btl_count ; ++i) { + OBJ_RELEASE(module->alternate_am_rdmas[i]); + } + free(module->alternate_am_rdmas); + } free (module); return OMPI_SUCCESS; diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index c6689d78812..6286fedeb69 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -40,43 +40,74 @@ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *modul struct mca_btl_base_endpoint_t **endpoint) { ompi_proc_t *proc = ompi_comm_peer_lookup (module->comm, peer_id); - mca_bml_base_endpoint_t *bml_endpoint; - int num_btls; - - /* for now just use the bml to get the btl endpoint */ - bml_endpoint = mca_bml_base_get_endpoint (proc); - - num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma); - - for (int module_btl_index = 0 ; module_btl_index < module->btls_in_use ; ++module_btl_index) { - for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { - if (bml_endpoint->btl_rdma.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]) { - *btl_index_out = module_btl_index; - *endpoint = bml_endpoint->btl_rdma.bml_btls[btl_index].btl_endpoint; - return OMPI_SUCCESS; - } + mca_bml_base_endpoint_t *bml_endpoint = mca_bml_base_get_endpoint(proc); + + if (module->use_accelerated_btl) { + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "rank %d: accelerated btl search for peer %d", 
+ ompi_comm_rank(module->comm), peer_id); + mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, + module->accelerated_btl); + if (NULL != bml_btl) { + *btl_index_out = 0; + *endpoint = bml_btl->btl_endpoint; + + return OMPI_SUCCESS; } - } + } else { + mca_bml_base_btl_t *bml_btl; + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "rank %d: alternate btl search for peer %d", + ompi_comm_rank(module->comm), peer_id); + + /* the non accelerated case is a bit difficult compared to the + * accelerated case. The right BTL could be in either the + * rdma or eager endpoint list, because we're using the am + * rdma interface to provide RDMA semantics. The important + * part is that we search the alternate_btls list in order, + * since it is sorted by latency. + */ + for (int osc_btl_idx = 0 ; osc_btl_idx < module->alternate_btl_count ; ++osc_btl_idx) { + mca_btl_base_module_t *search_btl = ompi_osc_rdma_selected_btl(module, osc_btl_idx); + const char *source = NULL; + + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "rank %d comparing with btl %s, %d", + ompi_comm_rank(module->comm), + search_btl->btl_component->btl_version.mca_component_name, + osc_btl_idx); + + source = "rdma"; + bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, search_btl); + if (NULL == bml_btl) { + source = "eager"; + bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_eager, search_btl); + } + if (NULL != bml_btl) { + *btl_index_out = osc_btl_idx; + *endpoint = bml_btl->btl_endpoint; - /* if this is a non-RDMA btl then the endpoint may be listed under eager */ - num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager); + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "rank %d found btl for peer %d (%s, %d, %s)", + ompi_comm_rank(module->comm), peer_id, + bml_btl->btl->btl_component->btl_version.mca_component_name, + 
osc_btl_idx, source); - for (int module_btl_index = 0 ; module_btl_index < module->btls_in_use ; ++module_btl_index) { - for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { - if (bml_endpoint->btl_eager.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]) { - *btl_index_out = module_btl_index; - *endpoint = bml_endpoint->btl_eager.bml_btls[btl_index].btl_endpoint; return OMPI_SUCCESS; } } } + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "rank %d: failed peer search for peer %d", + ompi_comm_rank(module->comm), peer_id); + /* unlikely but can happen when creating a peer for self */ return OMPI_ERR_UNREACH; } int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, ompi_osc_rdma_peer_t **peer_out) { - struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_base_endpoint_t *endpoint = NULL; ompi_osc_rdma_peer_t *peer; uint8_t module_btl_index = UINT8_MAX; @@ -84,8 +115,7 @@ int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, /* find a btl/endpoint to use for this peer */ int ret = ompi_osc_rdma_peer_btl_endpoint (module, peer_id, &module_btl_index, &endpoint); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret && !((module->selected_btls[0]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) && - peer_id == ompi_comm_rank (module->comm)))) { + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } @@ -134,7 +164,8 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "configuring peer for rank %d", peer->rank); if (module->use_memory_registration) { - registration_handle_size = module->selected_btls[0]->btl_registration_handle_size; + assert(module->use_accelerated_btl); + registration_handle_size = module->accelerated_btl->btl_registration_handle_size; } /* each node is responsible for holding a part of the rank -> node/local rank mapping array. 
this code diff --git a/opal/mca/btl/base/btl_base_am_rdma.c b/opal/mca/btl/base/btl_base_am_rdma.c index 2b1e3400195..ae9683936e6 100644 --- a/opal/mca/btl/base/btl_base_am_rdma.c +++ b/opal/mca/btl/base/btl_base_am_rdma.c @@ -701,7 +701,7 @@ static int am_rdma_target_put(mca_btl_base_module_t *btl, (struct mca_btl_base_registration_handle_t *) (*operation)->local_handle_data, (struct mca_btl_base_registration_handle_t *) (*operation)->remote_handle_data, hdr->data.rdma.size, /*flags=*/0, MCA_BTL_NO_ORDER, am_rdma_rdma_complete, - operation, NULL); + *operation, NULL); if (OPAL_SUCCESS != ret) { OBJ_RELEASE(*operation); }