From e7f0d53062c7f00cc426badf689658ad4e13fa62 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 8 Sep 2020 22:07:48 +0200 Subject: [PATCH] ADAPT: fallback to previous coll module on non-commutative operations (#30) Signed-off-by: Joseph Schuchart --- ompi/mca/coll/adapt/coll_adapt.h | 33 ++++++++++++++++++++++++ ompi/mca/coll/adapt/coll_adapt_ireduce.c | 12 +++++++++ ompi/mca/coll/adapt/coll_adapt_module.c | 25 ++++++++++++++++++ ompi/mca/coll/adapt/coll_adapt_reduce.c | 12 +++++++++ 4 files changed, 82 insertions(+) diff --git a/ompi/mca/coll/adapt/coll_adapt.h b/ompi/mca/coll/adapt/coll_adapt.h index a5c5b4a5f4a..3d2240d58c8 100644 --- a/ompi/mca/coll/adapt/coll_adapt.h +++ b/ompi/mca/coll/adapt/coll_adapt.h @@ -73,11 +73,44 @@ typedef struct mca_coll_adapt_component_t { } mca_coll_adapt_component_t; +/* + * Structure used to store what is necessary for the collective operations + * routines in case of fallback. + */ +typedef struct mca_coll_adapt_collective_fallback_s { + union { + mca_coll_base_module_reduce_fn_t reduce; + mca_coll_base_module_ireduce_fn_t ireduce; + } previous_routine; + mca_coll_base_module_t *previous_module; +} mca_coll_adapt_collective_fallback_t; + + +typedef enum mca_coll_adapt_colltype { + ADAPT_REDUCE = 0, + ADAPT_IREDUCE = 1, + ADAPT_COLLCOUNT +} mca_coll_adapt_colltype_t; + +/* + * Some defines to stick to the naming used in the other components in terms of + * fallback routines + */ +#define previous_reduce previous_routines[ADAPT_REDUCE].previous_routine.reduce +#define previous_ireduce previous_routines[ADAPT_IREDUCE].previous_routine.ireduce + +#define previous_reduce_module previous_routines[ADAPT_REDUCE].previous_module +#define previous_ireduce_module previous_routines[ADAPT_IREDUCE].previous_module + + /* Coll adapt module per communicator*/ struct mca_coll_adapt_module_t { /* Base module */ mca_coll_base_module_t super; + /* To be able to fallback when the cases are not supported */ + struct mca_coll_adapt_collective_fallback_s previous_routines[ADAPT_COLLCOUNT]; + /* Whether this module has been lazily initialized or not yet */ bool adapt_enabled; }; diff --git a/ompi/mca/coll/adapt/coll_adapt_ireduce.c b/ompi/mca/coll/adapt/coll_adapt_ireduce.c index 1d3a3b62a58..58a8fac7940 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ireduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_ireduce.c @@ -521,6 +521,18 @@ int ompi_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module) { + + /* Fall-back if operation is commutative */ + if (!ompi_op_is_commute(op)){ + mca_coll_adapt_module_t *adapt_module = (mca_coll_adapt_module_t *) module; + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "ADAPT cannot handle reduce with this (non-commutative) operation. It needs to fall back on another component\n")); + return adapt_module->previous_ireduce(sbuf, rbuf, count, dtype, op, root, + comm, request, + adapt_module->previous_reduce_module); + } + + OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "ireduce root %d, algorithm %d, coll_adapt_ireduce_segment_size %zu, coll_adapt_ireduce_max_send_requests %d, coll_adapt_ireduce_max_recv_requests %d\n", root, mca_coll_adapt_component.adapt_ireduce_algorithm, diff --git a/ompi/mca/coll/adapt/coll_adapt_module.c b/ompi/mca/coll/adapt/coll_adapt_module.c index 20f27d2ab24..1d61d66abf3 100644 --- a/ompi/mca/coll/adapt/coll_adapt_module.c +++ b/ompi/mca/coll/adapt/coll_adapt_module.c @@ -69,12 +69,37 @@ OBJ_CLASS_INSTANCE(mca_coll_adapt_module_t, adapt_module_construct, adapt_module_destruct); +/* + * In this macro, the following variables are supposed to have been declared + * in the caller: + * . ompi_communicator_t *comm + * . mca_coll_adapt_module_t *adapt_module + */ +#define ADAPT_SAVE_PREV_COLL_API(__api) \ + do { \ + adapt_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ + adapt_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module; \ + if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \ + opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ + "(%d/%s): no underlying " # __api"; disqualifying myself", \ + comm->c_contextid, comm->c_name); \ + return OMPI_ERROR; \ + } \ + OBJ_RETAIN(adapt_module->previous_ ## __api ## _module); \ + } while(0) + + /* * Init module on the communicator */ static int adapt_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm) { + mca_coll_adapt_module_t * adapt_module = (mca_coll_adapt_module_t*) module; + + ADAPT_SAVE_PREV_COLL_API(reduce); + ADAPT_SAVE_PREV_COLL_API(ireduce); + return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/adapt/coll_adapt_reduce.c b/ompi/mca/coll/adapt/coll_adapt_reduce.c index e3559ec20df..d0ad26d6e68 100644 --- a/ompi/mca/coll/adapt/coll_adapt_reduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_reduce.c @@ -9,6 +9,8 @@ * $HEADER$ */ + +#include "ompi/op/op.h" #include "coll_adapt.h" #include "coll_adapt_algorithms.h" @@ -17,6 +19,16 @@ int ompi_coll_adapt_reduce(const void *sbuf, void *rbuf, int count, struct ompi_ struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { + /* Fall-back if operation is commutative */ + if (!ompi_op_is_commute(op)){ + mca_coll_adapt_module_t *adapt_module = (mca_coll_adapt_module_t *) module; + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "ADAPT cannot handle reduce with this (commutative) operation. It needs to fall back on another component\n")); + return adapt_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, + comm, + adapt_module->previous_reduce_module); + } + ompi_request_t *request = NULL; int err = ompi_coll_adapt_ireduce(sbuf, rbuf, count, dtype, op, root, comm, &request, module); if( MPI_SUCCESS != err ) {