From de24d99d81c634092b4b88ffc8263116cab48a6b Mon Sep 17 00:00:00 2001 From: Thomas Goncalves Date: Fri, 4 Sep 2020 11:38:29 +0200 Subject: [PATCH] [COLL/LIBNBC] Upgrade dynamic rules support Signed-off-by: Thomas Goncalves --- ompi/mca/coll/libnbc/Makefile.am | 2 + ompi/mca/coll/libnbc/coll_libnbc.h | 51 +++- ompi/mca/coll/libnbc/coll_libnbc_component.c | 233 +++++++----------- ompi/mca/coll/libnbc/help-mpi-coll-libnbc.txt | 147 +++++++++++ ompi/mca/coll/libnbc/nbc_iallgather.c | 60 ++++- ompi/mca/coll/libnbc/nbc_iallreduce.c | 85 +++++-- ompi/mca/coll/libnbc/nbc_ialltoall.c | 65 +++-- ompi/mca/coll/libnbc/nbc_ialltoallv.c | 80 +++++- ompi/mca/coll/libnbc/nbc_ialltoallw.c | 79 +++++- ompi/mca/coll/libnbc/nbc_ibcast.c | 178 ++++++++++--- ompi/mca/coll/libnbc/nbc_iexscan.c | 56 ++++- ompi/mca/coll/libnbc/nbc_ireduce.c | 98 ++++++-- ompi/mca/coll/libnbc/nbc_iscan.c | 55 ++++- 13 files changed, 928 insertions(+), 261 deletions(-) create mode 100644 ompi/mca/coll/libnbc/help-mpi-coll-libnbc.txt diff --git a/ompi/mca/coll/libnbc/Makefile.am b/ompi/mca/coll/libnbc/Makefile.am index 4afa48cdd2c..382c7884c5d 100644 --- a/ompi/mca/coll/libnbc/Makefile.am +++ b/ompi/mca/coll/libnbc/Makefile.am @@ -22,6 +22,8 @@ # $HEADER$ # +dist_ompidata_DATA = help-mpi-coll-libnbc.txt + sources = \ coll_libnbc.h \ coll_libnbc_component.c \ diff --git a/ompi/mca/coll/libnbc/coll_libnbc.h b/ompi/mca/coll/libnbc/coll_libnbc.h index bbd346e9c15..106c5baaf57 100644 --- a/ompi/mca/coll/libnbc/coll_libnbc.h +++ b/ompi/mca/coll/libnbc/coll_libnbc.h @@ -17,6 +17,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -30,6 +31,8 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_base_util.h" #include "opal/sys/atomic.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_base_dynamic_rules.h" BEGIN_C_DECLS @@ -69,14 +72,15 @@ BEGIN_C_DECLS /* number of implemented collective functions */ #define NBC_NUM_COLL 17 -extern bool libnbc_ibcast_skip_dt_decision; -extern int libnbc_iallgather_algorithm; -extern int libnbc_iallreduce_algorithm; -extern int libnbc_ibcast_algorithm; -extern int libnbc_ibcast_knomial_radix; -extern int libnbc_iexscan_algorithm; -extern int libnbc_ireduce_algorithm; -extern int libnbc_iscan_algorithm; +/* forced algorithm choices */ +/* this structure is for storing the indexes to the forced algorithm mca params... */ +/* we get these at component query (so that registered values appear in ompi_info) */ +struct coll_libnbc_force_algorithm_mca_param_indices_t { + int algorithm; /* which algorithm you want to force */ + int segsize; + int topo; +}; +typedef struct coll_libnbc_force_algorithm_mca_param_indices_t coll_libnbc_force_algorithm_mca_param_indices_t; struct ompi_coll_libnbc_component_t { mca_coll_base_component_2_0_0_t super; @@ -84,6 +88,13 @@ struct ompi_coll_libnbc_component_t { opal_list_t active_requests; int32_t active_comms; opal_mutex_t lock; /* protect access to the active_requests list */ + int dynamic_rules_verbose; + int stream; + coll_libnbc_force_algorithm_mca_param_indices_t forced_params[COLLCOUNT]; + /* cached decision table stuff */ + ompi_coll_base_alg_rule_t *all_base_rules; + int dynamic_rules_fileformat; + char* dynamic_rules_filename; }; typedef struct ompi_coll_libnbc_component_t ompi_coll_libnbc_component_t; @@ -94,6 +105,9 @@ struct ompi_coll_libnbc_module_t { mca_coll_base_module_t super; opal_mutex_t mutex; bool comm_registered; + + /* the communicator rules for each MPI collective for ONLY my comsize */ + 
ompi_coll_base_com_rule_t *com_rules[COLLCOUNT]; #ifdef NBC_CACHE_SCHEDULE void *NBC_Dict[NBC_NUM_COLL]; /* this should point to a struct hb_tree, but since this is a @@ -160,6 +174,27 @@ int ompi_coll_libnbc_progress(void); int NBC_Init_comm(MPI_Comm comm, ompi_coll_libnbc_module_t *module); int NBC_Progress(NBC_Handle *handle); +int ompi_coll_libnbc_allgather_check_forced_init (void); +int ompi_coll_libnbc_allreduce_check_forced_init (void); +int ompi_coll_libnbc_alltoall_check_forced_init (void); +int ompi_coll_libnbc_alltoallv_check_forced_init (void); +int ompi_coll_libnbc_alltoallw_check_forced_init (void); +int ompi_coll_libnbc_barrier_check_forced_init (void); +int ompi_coll_libnbc_bcast_check_forced_init (void); +int ompi_coll_libnbc_exscan_check_forced_init (void); +int ompi_coll_libnbc_gather_check_forced_init (void); +int ompi_coll_libnbc_gatherv_check_forced_init (void); +int ompi_coll_libnbc_reduce_check_forced_init (void); +int ompi_coll_libnbc_reduce_scatter_check_forced_init (void); +int ompi_coll_libnbc_reduce_scatter_block_check_forced_init (void); +int ompi_coll_libnbc_scan_check_forced_init (void); +int ompi_coll_libnbc_scatter_check_forced_init (void); +int ompi_coll_libnbc_scatterv_check_forced_init (void); +int ompi_coll_libnbc_neighbor_allgather_check_forced_init (void); +int ompi_coll_libnbc_neighbor_allgatherv_check_forced_init (void); +int ompi_coll_libnbc_neighbor_alltoall_check_forced_init (void); +int ompi_coll_libnbc_neighbor_alltoallv_check_forced_init (void); +int ompi_coll_libnbc_neighbor_alltoallw_check_forced_init (void); int ompi_coll_libnbc_iallgather(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, struct ompi_communicator_t *comm, ompi_request_t ** request, diff --git a/ompi/mca/coll/libnbc/coll_libnbc_component.c b/ompi/mca/coll/libnbc/coll_libnbc_component.c index bcb0e06c2d9..95f896daa09 100644 --- a/ompi/mca/coll/libnbc/coll_libnbc_component.c +++ 
b/ompi/mca/coll/libnbc/coll_libnbc_component.c @@ -19,6 +19,7 @@ * Copyright (c) 2017 Ian Bradley Morgan and Anthony Skjellum. All * rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,6 +35,8 @@ #include "mpi.h" #include "ompi/mca/coll/coll.h" #include "ompi/communicator/communicator.h" +#include "ompi/mca/coll/base/coll_base_dynamic_file.h" +#include "opal/util/show_help.h" /* * Public string showing the coll ompi_libnbc component version number @@ -44,61 +47,6 @@ const char *mca_coll_libnbc_component_version_string = static int libnbc_priority = 10; static bool libnbc_in_progress = false; /* protect from recursive calls */ -bool libnbc_ibcast_skip_dt_decision = true; - -int libnbc_iallgather_algorithm = 0; /* iallgather user forced algorithm */ -static mca_base_var_enum_value_t iallgather_algorithms[] = { - {0, "ignore"}, - {1, "linear"}, - {2, "recursive_doubling"}, - {0, NULL} -}; - -int libnbc_iallreduce_algorithm = 0; /* iallreduce user forced algorithm */ -static mca_base_var_enum_value_t iallreduce_algorithms[] = { - {0, "ignore"}, - {1, "ring"}, - {2, "binomial"}, - {3, "rabenseifner"}, - {4, "recursive_doubling"}, - {0, NULL} -}; - -int libnbc_ibcast_algorithm = 0; /* ibcast user forced algorithm */ -int libnbc_ibcast_knomial_radix = 4; -static mca_base_var_enum_value_t ibcast_algorithms[] = { - {0, "ignore"}, - {1, "linear"}, - {2, "binomial"}, - {3, "chain"}, - {4, "knomial"}, - {0, NULL} -}; - -int libnbc_iexscan_algorithm = 0; /* iexscan user forced algorithm */ -static mca_base_var_enum_value_t iexscan_algorithms[] = { - {0, "ignore"}, - {1, "linear"}, - {2, "recursive_doubling"}, - {0, NULL} -}; - -int libnbc_ireduce_algorithm = 0; /* ireduce user forced algorithm */ -static mca_base_var_enum_value_t ireduce_algorithms[] = { - {0, "ignore"}, - {1, "chain"}, - {2, "binomial"}, - {3, "rabenseifner"}, - {0, NULL} -}; 
- -int libnbc_iscan_algorithm = 0; /* iscan user forced algorithm */ -static mca_base_var_enum_value_t iscan_algorithms[] = { - {0, "ignore"}, - {1, "linear"}, - {2, "recursive_doubling"}, - {0, NULL} -}; static int libnbc_open(void); static int libnbc_close(void); @@ -145,6 +93,38 @@ static int libnbc_open(void) { int ret; + if (mca_coll_libnbc_component.dynamic_rules_verbose > 0) { + mca_coll_libnbc_component.stream = opal_output_open(NULL); + opal_output_set_verbosity(mca_coll_libnbc_component.stream, mca_coll_libnbc_component.dynamic_rules_verbose); + } else { + mca_coll_libnbc_component.stream = -1; + } + if(mca_coll_libnbc_component.dynamic_rules_filename ) { + int rc; + opal_output_verbose(10, mca_coll_libnbc_component.stream , + "coll:libnbc:component_open Reading collective rules file [%s] which format is %d", + mca_coll_libnbc_component.dynamic_rules_filename, + mca_coll_libnbc_component.dynamic_rules_fileformat); + rc = ompi_coll_base_read_rules_config_file( mca_coll_libnbc_component.dynamic_rules_filename, + mca_coll_libnbc_component.dynamic_rules_fileformat, + &(mca_coll_libnbc_component.all_base_rules), COLLCOUNT); + if( rc >= 0 ) { + opal_output_verbose(10, mca_coll_libnbc_component.stream ,"coll:libnbc:module_open Read %d valid rules\n", rc); + if(ompi_coll_base_framework.framework_verbose >= 50) { + ompi_coll_base_dump_all_rules (mca_coll_libnbc_component.all_base_rules, COLLCOUNT); + } + } else { + opal_output_verbose(1, mca_coll_libnbc_component.stream ,"coll:libnbc:module_open Reading collective rules file failed\n"); + char error_name[12]; + snprintf(error_name, sizeof(error_name), "file fail%1d", rc); /* bounded write: a multi-digit rc must not overrun error_name */ + error_name[11] = '\0'; + opal_show_help("help-mpi-coll-libnbc.txt", (const char*)error_name, true, + mca_coll_libnbc_component.dynamic_rules_filename, mca_coll_libnbc_component.dynamic_rules_fileformat); + mca_coll_libnbc_component.all_base_rules = NULL; + } + } else { + mca_coll_libnbc_component.all_base_rules = NULL; + }
OBJ_CONSTRUCT(&mca_coll_libnbc_component.requests, opal_free_list_t); OBJ_CONSTRUCT(&mca_coll_libnbc_component.active_requests, opal_list_t); @@ -173,6 +153,14 @@ libnbc_close(void) OBJ_DESTRUCT(&mca_coll_libnbc_component.active_requests); OBJ_DESTRUCT(&mca_coll_libnbc_component.lock); + if( NULL != mca_coll_libnbc_component.all_base_rules ) { + ompi_coll_base_free_all_rules(mca_coll_libnbc_component.all_base_rules, COLLCOUNT); + mca_coll_libnbc_component.all_base_rules = NULL; + } + /* close stream */ + if(mca_coll_libnbc_component.stream >= 0) { + opal_output_close(mca_coll_libnbc_component.stream); + } return OMPI_SUCCESS; } @@ -191,94 +179,42 @@ libnbc_register(void) MCA_BASE_VAR_SCOPE_READONLY, &libnbc_priority); - /* ibcast decision function can make the wrong decision if a legal - * non-uniform data type signature is used. This has resulted in the - * collective operation failing, and possibly producing wrong answers. - * We are investigating a fix for this problem, but it is taking a while. - * https://github.com/open-mpi/ompi/issues/2256 - * https://github.com/open-mpi/ompi/issues/1763 - * As a result we are adding an MCA parameter to make a conservative - * decision to avoid this issue. If the user knows that their application - * does not use data types in this way, then they can set this parameter - * to get the old behavior. Once the issue is truely fixed, then this - * parameter can be removed. - */ - libnbc_ibcast_skip_dt_decision = true; - (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "ibcast_skip_dt_decision", - "In ibcast only use size of communicator to choose algorithm, exclude data type signature. Set to 'false' to use data type signature in decision. 
WARNING: If you set this to 'false' then your application should not use non-uniform data type signatures in calls to ibcast.", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + mca_coll_libnbc_component.dynamic_rules_verbose = 0; + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, "dynamic_rules_verbose", + "Verbose level of the libnbc coll component regarding on dynamic rules." + " Examples: 0: no verbose, 1: selection errors, 10: selection output", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &libnbc_ibcast_skip_dt_decision); - - libnbc_iallgather_algorithm = 0; - (void) mca_base_var_enum_create("coll_libnbc_iallgather_algorithms", iallgather_algorithms, &new_enum); - mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "iallgather_algorithm", - "Which iallgather algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &libnbc_iallgather_algorithm); - OBJ_RELEASE(new_enum); - - libnbc_iallreduce_algorithm = 0; - (void) mca_base_var_enum_create("coll_libnbc_iallreduce_algorithms", iallreduce_algorithms, &new_enum); - mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "iallreduce_algorithm", - "Which iallreduce algorithm is used: 0 ignore, 1 ring, 2 binomial, 3 rabenseifner, 4 recursive_doubling", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &libnbc_iallreduce_algorithm); - OBJ_RELEASE(new_enum); - - libnbc_ibcast_algorithm = 0; - (void) mca_base_var_enum_create("coll_libnbc_ibcast_algorithms", ibcast_algorithms, &new_enum); - mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "ibcast_algorithm", - "Which ibcast algorithm is used: 0 ignore, 1 linear, 2 binomial, 3 chain, 4 knomial", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 
MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &libnbc_ibcast_algorithm); - OBJ_RELEASE(new_enum); - - libnbc_ibcast_knomial_radix = 4; + &mca_coll_libnbc_component.dynamic_rules_verbose); + + mca_coll_libnbc_component.dynamic_rules_filename = NULL; (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "ibcast_knomial_radix", "k-nomial tree radix for the ibcast algorithm (radix > 1)", + "dynamic_rules_filename", + "Filename of configuration file that contains the dynamic (@runtime) decision function rules", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_libnbc_component.dynamic_rules_filename); + + mca_coll_libnbc_component.dynamic_rules_fileformat = 0; + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "dynamic_rules_fileformat", + "Format of configuration file that contains the dynamic (@runtime) decision function rules. Accepted values are: 0 , 1 ", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, + OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, - &libnbc_ibcast_knomial_radix); - - libnbc_iexscan_algorithm = 0; - (void) mca_base_var_enum_create("coll_libnbc_iexscan_algorithms", iexscan_algorithms, &new_enum); - mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "iexscan_algorithm", - "Which iexscan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &libnbc_iexscan_algorithm); - OBJ_RELEASE(new_enum); - - libnbc_ireduce_algorithm = 0; - (void) mca_base_var_enum_create("coll_libnbc_ireduce_algorithms", ireduce_algorithms, &new_enum); - mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "ireduce_algorithm", - "Which ireduce algorithm is used: 0 ignore, 1 chain, 2 binomial, 3 rabenseifner", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 
MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &libnbc_ireduce_algorithm); - OBJ_RELEASE(new_enum); - - libnbc_iscan_algorithm = 0; - (void) mca_base_var_enum_create("coll_libnbc_iscan_algorithms", iscan_algorithms, &new_enum); - mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "iscan_algorithm", - "Which iscan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &libnbc_iscan_algorithm); - OBJ_RELEASE(new_enum); + &mca_coll_libnbc_component.dynamic_rules_fileformat); + + ompi_coll_libnbc_allgather_check_forced_init (); + ompi_coll_libnbc_allreduce_check_forced_init (); + ompi_coll_libnbc_alltoall_check_forced_init (); + ompi_coll_libnbc_alltoallv_check_forced_init (); + ompi_coll_libnbc_alltoallw_check_forced_init (); + ompi_coll_libnbc_bcast_check_forced_init (); + ompi_coll_libnbc_exscan_check_forced_init (); + ompi_coll_libnbc_reduce_check_forced_init (); + ompi_coll_libnbc_scan_check_forced_init (); return OMPI_SUCCESS; } @@ -417,6 +353,27 @@ static int libnbc_module_enable(mca_coll_base_module_t *module, struct ompi_communicator_t *comm) { + ompi_coll_libnbc_module_t* nbc_module = (ompi_coll_libnbc_module_t*) module; + int i; + if(mca_coll_libnbc_component.all_base_rules) { + int size, nnodes; + /* Allocate the data that hangs off the communicator */ + if (OMPI_COMM_IS_INTER(comm)) { + size = ompi_comm_remote_size(comm); + } else { + size = ompi_comm_size(comm); + } + /* Get the number of nodes in communicator */ + nnodes = ompi_coll_base_get_nnodes(comm); + for(i = 0; i < COLLCOUNT; i++) { /* one rule lookup per collective ID */ + nbc_module->com_rules[i] = ompi_coll_base_get_com_rule_ptr(mca_coll_libnbc_component.all_base_rules, + i, nnodes, size ); + } + } else { + for(i = 0; i < COLLCOUNT; i++) { /* no rules loaded: clear every slot */ + nbc_module->com_rules[i] = NULL; + } + } /* All done */ return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/libnbc/help-mpi-coll-libnbc.txt b/ompi/mca/coll/libnbc/help-mpi-coll-libnbc.txt new file
mode 100644 index 00000000000..6e4e9163943 --- /dev/null +++ b/ompi/mca/coll/libnbc/help-mpi-coll-libnbc.txt @@ -0,0 +1,147 @@ +# -*- text -*- +# +# Copyright (c) 2020 Bull SAS. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open MPI's libnbc component +# (which uses the base config file parser). +# +[file fail-1] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The coll file parser was called with a null filename string. +This is an internal error. The parser should be used only if a rules filename was set by the user. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-2] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The coll file parser was called with a null rules pointer. +This is an internal error. Rules object pointer must be valid. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-3] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The coll file parser was called with an invalid max collectives number. +This is an internal error. Max collectives number must be greater than 0. + +This error causes the libnbc component to ignore the provided dynamic rules.
+Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-4] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The file can't be opened. Either the file is missing or its access rights are wrong. +Check if the file path defined by the MCA parameter +OMPI_MCA_coll_libnbc_dynamic_rules_filename is valid. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-5] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The file format is unknown. It must be either + 0: Rules are defined according to a combination of + collective_id, communicator size and message size + or 1: Rules are defined according to a combination of + collective_id, communicator nodes number, communicator size and message size. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-6] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +Rules object allocation failed. This can be caused by memory resource exhaustion. + +This error causes the libnbc component to ignore the provided dynamic rules.
+Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-7] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The file parsing aborted. This can be caused by either: + a format mismatch + or a missing line + or an invalid configuration number. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-8] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +Inconsistent collective ID or collective number. Collective number can't be greater than 22 and collective IDs belong to [0,21]. +As a reminder, the collective IDs are listed below: +IALLGATHER = 0, IALLGATHERV = 1, IALLREDUCE = 2, +IALLTOALL = 3, IALLTOALLV = 4, IALLTOALLW = 5, +IBARRIER = 6, IBCAST = 7, IEXSCAN = 8, IGATHER = 9, IGATHERV = 10, +IREDUCE = 11, IREDUCESCATTER = 12, IREDUCESCATTERBLOCK = 13, ISCAN = 14, +ISCATTER = 15, ISCATTERV = 16, INEIGHBOR_ALLGATHER = 17, INEIGHBOR_ALLGATHERV = 18, +INEIGHBOR_ALLTOALL = 19, INEIGHBOR_ALLTOALLV = 20, INEIGHBOR_ALLTOALLW = 21. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose.
+OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-9] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +Inconsistent collective ID in rules object. This is an internal error. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) diff --git a/ompi/mca/coll/libnbc/nbc_iallgather.c b/ompi/mca/coll/libnbc/nbc_iallgather.c index 29ba7a6a9c1..e6f3dd867ef 100644 --- a/ompi/mca/coll/libnbc/nbc_iallgather.c +++ b/ompi/mca/coll/libnbc/nbc_iallgather.c @@ -11,6 +11,7 @@ * reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved.
* $COPYRIGHT$ * * Additional copyrights may follow @@ -29,6 +30,35 @@ static inline int allgather_sched_recursivedoubling( int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype); +int libnbc_iallgather_algorithm = 0; /* iallgather user forced algorithm */ +static mca_base_var_enum_value_t iallgather_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "recursive_doubling"}, + {0, NULL} +}; + +/* The following are used by dynamic and forced rules */ +/* this routine is called by the component only */ + +int ompi_coll_libnbc_allgather_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[ALLGATHER].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_iallgather_algorithms", iallgather_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "iallgather_algorithm", + "Which iallgather algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[ALLGATHER].algorithm); + + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Allgather_args_compare(NBC_Allgather_args *a, NBC_Allgather_args *b, void *param) { @@ -69,19 +99,35 @@ static int nbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype s p = ompi_comm_size (comm); int is_commsize_pow2 = !(p & (p - 1)); - if (libnbc_iallgather_algorithm == 0) { + if(!is_commsize_pow2) { + /* default */ alg = NBC_ALLGATHER_LINEAR; - } else { - /* user forced dynamic decision */ - if (libnbc_iallgather_algorithm == 1) { - alg = NBC_ALLGATHER_LINEAR; - } else if (libnbc_iallgather_algorithm == 2 && is_commsize_pow2) { - alg = NBC_ALLGATHER_RDBL; + } else if (libnbc_module->com_rules[ALLGATHER]) { + int 
algorithm,dummy1,dummy2,dummy3; + /* compute data size to choose correct rule */ + size_t dsize; + ompi_datatype_type_size (sendtype, &dsize); + dsize *= sendcount; + /* get algorithm */ + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[ALLGATHER], + dsize, &dummy1, &dummy2, &dummy3); + if (algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum */ } else { + /* default */ alg = NBC_ALLGATHER_LINEAR; } + } else if (0 != mca_coll_libnbc_component.forced_params[ALLGATHER].algorithm) { + alg = mca_coll_libnbc_component.forced_params[ALLGATHER].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + /* default */ + alg = NBC_ALLGATHER_LINEAR; } + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc iallgather : algorithm %d (no segmentation supported)", + alg + 1); + res = ompi_datatype_type_extent(recvtype, &rcvext); if (MPI_SUCCESS != res) { return res; diff --git a/ompi/mca/coll/libnbc/nbc_iallreduce.c b/ompi/mca/coll/libnbc/nbc_iallreduce.c index b8e9f27cbdf..5c91c0c4ac3 100644 --- a/ompi/mca/coll/libnbc/nbc_iallreduce.c +++ b/ompi/mca/coll/libnbc/nbc_iallreduce.c @@ -11,6 +11,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -42,6 +43,38 @@ static inline int allred_sched_redscat_allgather( const void *sbuf, void *rbuf, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf, struct ompi_communicator_t *comm); +static mca_base_var_enum_value_t iallreduce_algorithms[] = { + {0, "ignore"}, + {1, "ring"}, + {2, "binomial"}, + {3, "rabenseifner"}, + {4, "recursive_doubling"}, + {0, NULL} +}; + +typedef enum { NBC_ARED_RING, NBC_ARED_BINOMIAL, NBC_ARED_REDSCAT_ALLGATHER, NBC_ARED_RDBL } ared_algorithm_t; /* order mirrors iallreduce_algorithms IDs 1..4 so that (ID - 1) is the enum value */ + +/* The following are used by dynamic and forced rules */ +/* this routine is called by the component only */ + +int ompi_coll_libnbc_allreduce_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[ALLREDUCE].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_iallreduce_algorithms", iallreduce_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "iallreduce_algorithm", + "Which iallreduce algorithm is used: 0 ignore, 1 ring, 2 binomial, 3 rabenseifner, 4 recursive_doubling", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[ALLREDUCE].algorithm); + + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Allreduce_args_compare(NBC_Allreduce_args *a, NBC_Allreduce_args *b, void *param) { @@ -61,6 +94,18 @@ int NBC_Allreduce_args_compare(NBC_Allreduce_args *a, NBC_Allreduce_args *b, voi } #endif +static ared_algorithm_t nbc_allreduce_default_algorithm(int p, size_t size, int count, + MPI_Op op, char inplace, int nprocs_pof2) +{ + if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) { + return NBC_ARED_BINOMIAL; + } else if (count >= nprocs_pof2 && ompi_op_is_commute(op)) { + return NBC_ARED_REDSCAT_ALLGATHER; + }
else { + return NBC_ARED_RING; + } +} + static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module, bool persistent) @@ -72,7 +117,7 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI #ifdef NBC_CACHE_SCHEDULE NBC_Allreduce_args *args, *found, search; #endif - enum { NBC_ARED_BINOMIAL, NBC_ARED_RING, NBC_ARED_REDSCAT_ALLGATHER, NBC_ARED_RDBL } alg; + ared_algorithm_t alg; char inplace; void *tmpbuf = NULL; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; @@ -114,26 +159,32 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI /* algorithm selection */ int nprocs_pof2 = opal_next_poweroftwo(p) >> 1; - if (libnbc_iallreduce_algorithm == 0) { - if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) { - alg = NBC_ARED_BINOMIAL; - } else if (count >= nprocs_pof2 && ompi_op_is_commute(op)) { - alg = NBC_ARED_REDSCAT_ALLGATHER; + + if(libnbc_module->com_rules[ALLREDUCE]) { + int algorithm,dummy1,dummy2,dummy3; + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[ALLREDUCE], + size * count, &dummy1, &dummy2, &dummy3); + if(algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum */ } else { - alg = NBC_ARED_RING; + /* default */ + alg = nbc_allreduce_default_algorithm (p, size, count, op, inplace, nprocs_pof2); } + } else if(0 != mca_coll_libnbc_component.forced_params[ALLREDUCE].algorithm) { + /* if op is not commutative or MPI_IN_PLACE was specified we have to deal with it */ + alg = mca_coll_libnbc_component.forced_params[ALLREDUCE].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ } else { - if (libnbc_iallreduce_algorithm == 1) - alg = NBC_ARED_RING; - else if (libnbc_iallreduce_algorithm == 2) - alg = NBC_ARED_BINOMIAL; - else if 
(libnbc_iallreduce_algorithm == 3 && count >= nprocs_pof2 && ompi_op_is_commute(op)) - alg = NBC_ARED_REDSCAT_ALLGATHER; - else if (libnbc_iallreduce_algorithm == 4) - alg = NBC_ARED_RDBL; - else - alg = NBC_ARED_RING; + /* default */ + alg = nbc_allreduce_default_algorithm (p, size, count, op, inplace, nprocs_pof2); + } + + if (NBC_ARED_REDSCAT_ALLGATHER == alg && (count < nprocs_pof2 || !ompi_op_is_commute(op))) { + alg = NBC_ARED_RING; } + + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc iallreduce : algorithm %d (no segmentation supported)", + alg + 1); #ifdef NBC_CACHE_SCHEDULE /* search schedule in communicator specific tree */ search.sendbuf = sendbuf; diff --git a/ompi/mca/coll/libnbc/nbc_ialltoall.c b/ompi/mca/coll/libnbc/nbc_ialltoall.c index 6c5883f23c9..029ec9e11a7 100644 --- a/ompi/mca/coll/libnbc/nbc_ialltoall.c +++ b/ompi/mca/coll/libnbc/nbc_ialltoall.c @@ -12,6 +12,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -33,6 +34,34 @@ static inline int a2a_sched_diss(int rank, int p, MPI_Aint sndext, MPI_Aint rcve static inline int a2a_sched_inplace(int rank, int p, NBC_Schedule* schedule, void* buf, int count, MPI_Datatype type, MPI_Aint ext, ptrdiff_t gap, MPI_Comm comm); +static mca_base_var_enum_value_t ialltoall_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "pairwise"}, + {3, "binomial"}, + {0, NULL} +}; + +/* The following are used by dynamic and forced rules */ +/* this routine is called by the component only */ + +int ompi_coll_libnbc_alltoall_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[ALLTOALL].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_ialltoall_algorithms", ialltoall_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ialltoall_algorithm", + "Which ialltoall algorithm is used unless MPI_IN_PLACE flag has been specified: 0 ignore, 1 linear, 2 pairwise, 3 binomial", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[ALLTOALL].algorithm); + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Alltoall_args_compare(NBC_Alltoall_args *a, NBC_Alltoall_args *b, void *param) { @@ -60,7 +89,7 @@ static int nbc_alltoall_init(const void* sendbuf, int sendcount, MPI_Datatype se { int rank, p, res; MPI_Aint datasize; - size_t a2asize, sndsize; + size_t sndsize; NBC_Schedule *schedule; MPI_Aint rcvext, sndext; #ifdef NBC_CACHE_SCHEDULE @@ -95,22 +124,26 @@ static int nbc_alltoall_init(const void* sendbuf, int sendcount, MPI_Datatype se return res; } - /* algorithm selection */ - a2asize = sndsize * sendcount * p; - /* this number is optimized for TCP on odin.cs.indiana.edu */ - if (inplace) { + 
if(inplace) { alg = NBC_A2A_INPLACE; - } else if((p <= 8) && ((a2asize < 1<<17) || (sndsize*sendcount < 1<<12))) { - /* just send as fast as we can if we have less than 8 peers, if the - * total communicated size is smaller than 1<<17 *and* if we don't - * have eager messages (msgsize < 1<<13) */ - alg = NBC_A2A_LINEAR; - } else if(a2asize < (1<<12)*(unsigned int)p) { - /*alg = NBC_A2A_DISS;*/ - alg = NBC_A2A_LINEAR; - } else - alg = NBC_A2A_LINEAR; /*NBC_A2A_PAIRWISE;*/ + } else if(libnbc_module->com_rules[ALLTOALL]) { + int algorithm,dummy1,dummy2,dummy3; + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[ALLTOALL], + sndsize * sendcount, &dummy1, &dummy2, &dummy3); + if(algorithm) { + alg = algorithm - 1; + } else { + alg = NBC_A2A_LINEAR; /* default if not inplace */ + } + } else if(0 != mca_coll_libnbc_component.forced_params[ALLTOALL].algorithm) { + alg = mca_coll_libnbc_component.forced_params[ALLTOALL].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_A2A_LINEAR; /* default if not inplace */ + } + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc ialltoall : algorithm %d (no segmentation supported)", + alg + 1); /* allocate temp buffer if we need one */ if (alg == NBC_A2A_INPLACE) { span = opal_datatype_span(&recvtype->super, recvcount, &gap); @@ -197,7 +230,7 @@ static int nbc_alltoall_init(const void* sendbuf, int sendcount, MPI_Datatype se free(tmpbuf); return OMPI_ERR_OUT_OF_RESOURCE; } - + // cppcheck-suppress knownConditionTrueFalse if (!inplace) { /* copy my data to receive buffer */ rbuf = (char *) recvbuf + (MPI_Aint)rank * (MPI_Aint)recvcount * rcvext; diff --git a/ompi/mca/coll/libnbc/nbc_ialltoallv.c b/ompi/mca/coll/libnbc/nbc_ialltoallv.c index 5d13d524ea4..91743426e49 100644 --- a/ompi/mca/coll/libnbc/nbc_ialltoallv.c +++ b/ompi/mca/coll/libnbc/nbc_ialltoallv.c @@ -11,6 +11,7 @@ * reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -36,6 +37,34 @@ static inline int a2av_sched_inplace(int rank, int p, NBC_Schedule *schedule, void *buf, const int *counts, const int *displs, MPI_Aint ext, MPI_Datatype type, ptrdiff_t gap); +static mca_base_var_enum_value_t ialltoallv_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "pairwise"}, + {0, NULL} +}; + +/* The following are used by dynamic and forced rules */ +/* this routine is called by the component only */ + +int ompi_coll_libnbc_alltoallv_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[ALLTOALLV].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_ialltoallv_algorithms", ialltoallv_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ialltoallv_algorithm", + "Which ialltoallv algorithm is used unless MPI_IN_PLACE flag has been specified: 0 ignore, 1 linear, 2 pairwise", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[ALLTOALLV].algorithm); + + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + /* an alltoallv schedule can not be cached easily because the contents * ot the recvcounts array may change, so a comparison of the address * would not be sufficient ... we simply do not cache it */ @@ -65,6 +94,34 @@ static int nbc_alltoallv_init(const void* sendbuf, const int *sendcounts, const return res; } + enum {NBC_A2AV_LINEAR, NBC_A2AV_PAIRWISE, NBC_A2AV_INPLACE} alg; + + if (inplace) { + alg = NBC_A2AV_INPLACE; + } else if (libnbc_module->com_rules[ALLTOALLV]) { + int algorithm,dummy1,dummy2,dummy3; + /** + * check to see if we have some filebased rules. 
As we don't have global + * knowledge about the total amount of data, use the first available rule. + * This allows the users to specify the alltoallv algorithm to be used only + * based on the communicator size. + */ + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[ALLTOALLV], + 0, &dummy1, &dummy2, &dummy3); + if (algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_A2AV_LINEAR; /* default if not inplace */ + } + } else if (0 != mca_coll_libnbc_component.forced_params[ALLTOALLV].algorithm) { + alg = mca_coll_libnbc_component.forced_params[ALLTOALLV].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_A2AV_LINEAR; /* default if not inplace */ + } + + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc ialltoallv : algorithm %d (no segmentation supported)", + alg + 1); /* copy data to receivbuffer */ if (inplace) { int count = 0; @@ -109,14 +166,23 @@ static int nbc_alltoallv_init(const void* sendbuf, const int *sendcounts, const } } - if (inplace) { - res = a2av_sched_inplace(rank, p, schedule, recvbuf, recvcounts, - rdispls, rcvext, recvtype, gap); - } else { - res = a2av_sched_linear(rank, p, schedule, - sendbuf, sendcounts, sdispls, sndext, sendtype, - recvbuf, recvcounts, rdispls, rcvext, recvtype); + switch (alg) { + case NBC_A2AV_INPLACE: + res = a2av_sched_inplace(rank, p, schedule, recvbuf, recvcounts, + rdispls, rcvext, recvtype, gap); + break; + case NBC_A2AV_LINEAR: + res = a2av_sched_linear(rank, p, schedule, + sendbuf, sendcounts, sdispls, sndext, sendtype, + recvbuf, recvcounts, rdispls, rcvext, recvtype); + break; + case NBC_A2AV_PAIRWISE: + res = a2av_sched_pairwise(rank, p, schedule, + sendbuf, sendcounts, sdispls, sndext, sendtype, + recvbuf, recvcounts, rdispls, rcvext, recvtype); + break; } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); free(tmpbuf); diff --git
a/ompi/mca/coll/libnbc/nbc_ialltoallw.c b/ompi/mca/coll/libnbc/nbc_ialltoallw.c index ae293697c7c..42ea574d27f 100644 --- a/ompi/mca/coll/libnbc/nbc_ialltoallw.c +++ b/ompi/mca/coll/libnbc/nbc_ialltoallw.c @@ -11,6 +11,7 @@ * reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -36,6 +37,34 @@ static inline int a2aw_sched_inplace(int rank, int p, NBC_Schedule *schedule, void *buf, const int *counts, const int *displs, struct ompi_datatype_t * const * types); +static mca_base_var_enum_value_t ialltoallw_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "pairwise"}, + {0, NULL} +}; + +/* The following are used by dynamic and forced rules */ +/* this routine is called by the component only */ + +int ompi_coll_libnbc_alltoallw_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[ALLTOALLW].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_alltoallw_algorithms", ialltoallw_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ialltoallw_algorithm", + "Which ialltoallw algorithm is used unless MPI_IN_PLACE flag has been specified: 0 ignore, 1 linear, 2 pairwise", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[ALLTOALLW].algorithm); + + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + /* an alltoallw schedule can not be cached easily because the contents * ot the recvcounts array may change, so a comparison of the address * would not be sufficient ... 
we simply do not cache it */ @@ -58,6 +87,33 @@ static int nbc_alltoallw_init(const void* sendbuf, const int *sendcounts, const rank = ompi_comm_rank (comm); p = ompi_comm_size (comm); + enum {NBC_A2AW_LINEAR, NBC_A2AW_PAIRWISE, NBC_A2AW_INPLACE} alg; + if (inplace) { + alg = NBC_A2AW_INPLACE; + } else if (libnbc_module->com_rules[ALLTOALLW]) { + int algorithm,dummy1,dummy2,dummy3; + /** + * check to see if we have some filebased rules. As we don't have global + * knowledge about the total amount of data, use the first available rule. + * This allows the users to specify the alltoallw algorithm to be used only + * based on the communicator size. + */ + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[ALLTOALLW], + 0, &dummy1, &dummy2, &dummy3); + if (algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_A2AW_LINEAR; /* default if not inplace */ + } + } else if (0 != mca_coll_libnbc_component.forced_params[ALLTOALLW].algorithm) { + alg = mca_coll_libnbc_component.forced_params[ALLTOALLW].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_A2AW_LINEAR; /* default if not inplace */ + } + + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc ialltoallw : algorithm %d (no segmentation supported)", + alg + 1); /* copy data to receivbuffer */ if (inplace) { ptrdiff_t lgap, lspan; @@ -95,14 +151,23 @@ static int nbc_alltoallw_init(const void* sendbuf, const int *sendcounts, const } } - if (inplace) { - res = a2aw_sched_inplace(rank, p, schedule, recvbuf, - recvcounts, rdispls, recvtypes); - } else { - res = a2aw_sched_linear(rank, p, schedule, - sendbuf, sendcounts, sdispls, sendtypes, - recvbuf, recvcounts, rdispls, recvtypes); + switch(alg) { + case NBC_A2AW_INPLACE: + res = a2aw_sched_inplace(rank, p, schedule, recvbuf, + recvcounts, rdispls, recvtypes); + break; + case NBC_A2AW_LINEAR: + res = a2aw_sched_linear(rank, p, schedule, + sendbuf,
sendcounts, sdispls, sendtypes, + recvbuf, recvcounts, rdispls, recvtypes); + break; + case NBC_A2AW_PAIRWISE: + res = a2aw_sched_pairwise(rank, p, schedule, + sendbuf, sendcounts, sdispls, sendtypes, + recvbuf, recvcounts, rdispls, recvtypes); + break; } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); free(tmpbuf); diff --git a/ompi/mca/coll/libnbc/nbc_ibcast.c b/ompi/mca/coll/libnbc/nbc_ibcast.c index cbd381328d0..ea627c30e18 100644 --- a/ompi/mca/coll/libnbc/nbc_ibcast.c +++ b/ompi/mca/coll/libnbc/nbc_ibcast.c @@ -11,6 +11,7 @@ * reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -20,6 +21,9 @@ */ #include "nbc_internal.h" +#define IBCAST_DEFAULT_RADIX 4 +#define IBCAST_DEFAULT_SEGSIZE 16384 + static inline int bcast_sched_binomial(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype); static inline int bcast_sched_linear(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, @@ -29,6 +33,82 @@ static inline int bcast_sched_chain(int rank, int p, int root, NBC_Schedule *sch static inline int bcast_sched_knomial(int rank, int comm_size, int root, NBC_Schedule *schedule, void *buf, int count, MPI_Datatype datatype, int knomial_radix); +static int libnbc_ibcast_knomial_radix; +static bool libnbc_ibcast_skip_dt_decision; + +static mca_base_var_enum_value_t ibcast_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "binomial"}, + {3, "chain"}, + {4, "knomial"}, + {0, NULL} +}; + +typedef enum { NBC_BCAST_LINEAR, NBC_BCAST_BINOMIAL, NBC_BCAST_CHAIN, NBC_BCAST_KNOMIAL } bcast_algorithm_t; + +/* The following are used by dynamic and forced rules */ + +/* this routine is called by the component only */ +/* module does not call this it calls the forced_getvalues routine instead */ + +int 
ompi_coll_libnbc_bcast_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[BCAST].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_ibcast_algorithms", ibcast_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ibcast_algorithm", + "Which ibcast algorithm is used: 0 ignore, 1 linear, 2 binomial, 3 chain, 4 knomial", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[BCAST].algorithm); + + mca_coll_libnbc_component.forced_params[BCAST].segsize = IBCAST_DEFAULT_SEGSIZE; + mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ibcast_algorithm_segmentsize", + "Segment size in bytes used by default for ibcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[BCAST].segsize); + + libnbc_ibcast_knomial_radix = IBCAST_DEFAULT_RADIX; + + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ibcast_knomial_radix", "k-nomial tree radix for the ibcast algorithm (radix > 1)", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &libnbc_ibcast_knomial_radix); + /* ibcast decision function can make the wrong decision if a legal + * non-uniform data type signature is used. This has resulted in the + * collective operation failing, and possibly producing wrong answers. + * We are investigating a fix for this problem, but it is taking a while. + * https://github.com/open-mpi/ompi/issues/2256 + * https://github.com/open-mpi/ompi/issues/1763 + * As a result we are adding an MCA parameter to make a conservative + * decision to avoid this issue. 
If the user knows that their application + * does not use data types in this way, then they can set this parameter + * to get the old behavior. Once the issue is truly fixed, then this + * parameter can be removed. + */ + libnbc_ibcast_skip_dt_decision = true; + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ibcast_skip_dt_decision", + "In ibcast only use size of communicator to choose algorithm, exclude data type signature. Set to 'false' to use data type signature in decision. WARNING: If you set this to 'false' then your application should not use non-uniform data type signatures in calls to ibcast.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &libnbc_ibcast_skip_dt_decision); + + + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Bcast_args_compare(NBC_Bcast_args *a, NBC_Bcast_args *b, void *param) { @@ -47,17 +127,47 @@ int NBC_Bcast_args_compare(NBC_Bcast_args *a, NBC_Bcast_args *b, void *param) { } #endif +static bcast_algorithm_t nbc_bcast_default_algorithm(int p, size_t size, int count, + int* segsize) +{ + bcast_algorithm_t alg; + *segsize = IBCAST_DEFAULT_SEGSIZE; + + if( libnbc_ibcast_skip_dt_decision ) { + if (p <= 4) { + alg = NBC_BCAST_LINEAR; + } + else { + alg = NBC_BCAST_BINOMIAL; + } + } + else { + if (p <= 4) { + alg = NBC_BCAST_LINEAR; + } else if (size * count < 65536) { + alg = NBC_BCAST_BINOMIAL; + } else if (size * count < 524288) { + alg = NBC_BCAST_CHAIN; + *segsize = 8192; + } else { + alg = NBC_BCAST_CHAIN; + *segsize = 32768; + } + } + return alg; +} + static int nbc_bcast_init(void *buffer, int count, MPI_Datatype datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module, bool persistent) { - int rank, p, res, segsize; + int rank, p, res, segsize, radix; size_t size; NBC_Schedule *schedule;
#ifdef NBC_CACHE_SCHEDULE NBC_Bcast_args *args, *found, search; #endif - enum { NBC_BCAST_LINEAR, NBC_BCAST_BINOMIAL, NBC_BCAST_CHAIN, NBC_BCAST_KNOMIAL } alg; + bcast_algorithm_t alg; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; rank = ompi_comm_rank (comm); @@ -73,43 +183,35 @@ static int nbc_bcast_init(void *buffer, int count, MPI_Datatype datatype, int ro return res; } - segsize = 16384; - /* algorithm selection */ - if (libnbc_ibcast_algorithm == 0) { - if( libnbc_ibcast_skip_dt_decision ) { - if (p <= 4) { - alg = NBC_BCAST_LINEAR; - } - else { - alg = NBC_BCAST_BINOMIAL; - } - } - else { - if (p <= 4) { - alg = NBC_BCAST_LINEAR; - } else if (size * count < 65536) { - alg = NBC_BCAST_BINOMIAL; - } else if (size * count < 524288) { - alg = NBC_BCAST_CHAIN; - segsize = 8192; - } else { - alg = NBC_BCAST_CHAIN; - segsize = 32768; - } - } - } else { - /* user forced dynamic decision */ - if (libnbc_ibcast_algorithm == 1) { - alg = NBC_BCAST_LINEAR; - } else if (libnbc_ibcast_algorithm == 2) { - alg = NBC_BCAST_BINOMIAL; - } else if (libnbc_ibcast_algorithm == 3) { - alg = NBC_BCAST_CHAIN; - } else if (libnbc_ibcast_algorithm == 4 && libnbc_ibcast_knomial_radix > 1) { - alg = NBC_BCAST_KNOMIAL; + if (libnbc_module->com_rules[BCAST]) { + int algorithm, dummy; + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[BCAST], + size * count, &radix, &segsize, &dummy); + if (algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum */ } else { - alg = NBC_BCAST_LINEAR; + /* get default algorithm ID but keep our segsize value */ + alg = nbc_bcast_default_algorithm(p, size, count, &dummy); } + } else if (0 != mca_coll_libnbc_component.forced_params[BCAST].algorithm) { + alg = mca_coll_libnbc_component.forced_params[BCAST].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + segsize = mca_coll_libnbc_component.forced_params[BCAST].segsize; + radix = libnbc_ibcast_knomial_radix; + } 
else { + alg = nbc_bcast_default_algorithm(p, size, count, &segsize); + radix = libnbc_ibcast_knomial_radix; + } + + if (NBC_BCAST_KNOMIAL == alg && radix <= 1) { + alg = NBC_BCAST_LINEAR; + } + + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc ibcast : algorithm %d segmentsize %d radix %d", + alg + 1, segsize, radix); + + if(0 == segsize) { + segsize = count * size; /* only one frag */ } #ifdef NBC_CACHE_SCHEDULE @@ -137,7 +239,7 @@ static int nbc_bcast_init(void *buffer, int count, MPI_Datatype datatype, int ro res = bcast_sched_chain(rank, p, root, schedule, buffer, count, datatype, segsize, size); break; case NBC_BCAST_KNOMIAL: - res = bcast_sched_knomial(rank, p, root, schedule, buffer, count, datatype, libnbc_ibcast_knomial_radix); + res = bcast_sched_knomial(rank, p, root, schedule, buffer, count, datatype, radix); break; } diff --git a/ompi/mca/coll/libnbc/nbc_iexscan.c b/ompi/mca/coll/libnbc/nbc_iexscan.c index 547da001dc1..97dec640e1b 100644 --- a/ompi/mca/coll/libnbc/nbc_iexscan.c +++ b/ompi/mca/coll/libnbc/nbc_iexscan.c @@ -11,6 +11,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -32,6 +33,37 @@ static inline int exscan_sched_recursivedoubling( int count, MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2); +static mca_base_var_enum_value_t iexscan_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "recursive_doubling"}, + {0, NULL} +}; + + +/* The following are used by dynamic and forced rules */ + +/* this routine is called by the component only */ +/* module does not call this it calls the forced_getvalues routine instead */ + +int ompi_coll_libnbc_exscan_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[EXSCAN].algorithm = 0; + + (void) mca_base_var_enum_create("coll_libnbc_iexscan_algorithms", iexscan_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "iexscan_algorithm", + "Which iexscan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[EXSCAN].algorithm); + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { @@ -71,16 +103,34 @@ static int nbc_exscan_init(const void* sendbuf, void* recvbuf, int count, MPI_Da return nbc_get_noop_request(persistent, request); } + if (libnbc_module->com_rules[EXSCAN]) { + /* compute data size to choose correct rule */ + size_t dsize; + ompi_datatype_type_size (datatype, &dsize); + dsize *= count; + /* get algorithm */ + int algorithm, dummy1, dummy2, dummy3; + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[EXSCAN], + dsize, &dummy1, &dummy2, &dummy3); + if (algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum 
*/ + } else { + alg = NBC_EXSCAN_LINEAR; /* default */ + } + } else if (0 != mca_coll_libnbc_component.forced_params[EXSCAN].algorithm) { + alg = mca_coll_libnbc_component.forced_params[EXSCAN].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_EXSCAN_LINEAR; /* default */ + } + span = opal_datatype_span(&datatype->super, count, &gap); - if (libnbc_iexscan_algorithm == 2) { - alg = NBC_EXSCAN_RDBL; + if (NBC_EXSCAN_RDBL == alg) { ptrdiff_t span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t); tmpbuf = malloc(span_align + span); if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } tmpbuf1 = (void *)(-gap); tmpbuf2 = (char *)(span_align) - gap; } else { - alg = NBC_EXSCAN_LINEAR; if (rank > 0) { tmpbuf = malloc(span); if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } diff --git a/ompi/mca/coll/libnbc/nbc_ireduce.c b/ompi/mca/coll/libnbc/nbc_ireduce.c index da50f1eb276..22d3d5f5584 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce.c @@ -11,6 +11,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -26,6 +27,8 @@ #include "nbc_internal.h" +#define IREDUCE_DEFAULT_SEGSIZE 8192 + static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, char tmpredbuf, int count, MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf); static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, @@ -38,6 +41,47 @@ static inline int red_sched_redscat_gather( char tmpredbuf, int count, MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmp_buf, struct ompi_communicator_t *comm); +typedef enum { NBC_RED_BINOMIAL, NBC_RED_CHAIN, NBC_RED_REDSCAT_GATHER} reduce_algorithm_t; + +static mca_base_var_enum_value_t ireduce_algorithms[] = { + {0, "ignore"}, + {1, "chain"}, + {2, "binomial"}, + {3, "rabenseifner"}, + {0, NULL} +}; + +/* The following are used by dynamic and forced rules */ + +/* this routine is called by the component only */ +/* module does not call this it calls the forced_getvalues routine instead */ + +int ompi_coll_libnbc_reduce_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[REDUCE].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_ireduce_algorithms", ireduce_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ireduce_algorithm", + "Which ireduce algorithm is used: 0 ignore, 1 chain, 2 binomial, 3 rabenseifner", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[REDUCE].algorithm); + + mca_coll_libnbc_component.forced_params[REDUCE].segsize = IREDUCE_DEFAULT_SEGSIZE; + mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ireduce_algorithm_segmentsize", + "Segment size in bytes used by 
default for ireduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[REDUCE].segsize); + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Reduce_args_compare(NBC_Reduce_args *a, NBC_Reduce_args *b, void *param) { @@ -58,6 +102,18 @@ int NBC_Reduce_args_compare(NBC_Reduce_args *a, NBC_Reduce_args *b, void *param) } #endif +static reduce_algorithm_t nbc_reduce_default_algorithm(int p, size_t size, int count, + MPI_Op op, int nprocs_pof2) +{ + if (ompi_op_is_commute(op) && p > 2 && count >= nprocs_pof2) { + return NBC_RED_REDSCAT_GATHER; + } else if (p > 4 || size * count < 65536 || !ompi_op_is_commute(op)) { + return NBC_RED_BINOMIAL; + } else { + return NBC_RED_CHAIN; + } +} + /* the non-blocking reduce */ static int nbc_reduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, @@ -102,27 +158,30 @@ static int nbc_reduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Da } span = opal_datatype_span(&datatype->super, count, &gap); + int nprocs_pof2 = opal_next_poweroftwo(p) >> 1; /* algorithm selection */ - int nprocs_pof2 = opal_next_poweroftwo(p) >> 1; - if (libnbc_ireduce_algorithm == 0) { - if (ompi_op_is_commute(op) && p > 2 && count >= nprocs_pof2) { - alg = NBC_RED_REDSCAT_GATHER; - } else if (p > 4 || size * count < 65536 || !ompi_op_is_commute(op)) { - alg = NBC_RED_BINOMIAL; + if (libnbc_module->com_rules[REDUCE]) { + int algorithm,dummy1,dummy2; + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[REDUCE], + size * count, &dummy1, &segsize, &dummy2); + if (algorithm) { + alg = algorithm - 1;/* -1 is to shift from algorithm ID 
to enum */ } else { - alg = NBC_RED_CHAIN; + alg = nbc_reduce_default_algorithm(p, size, count, op, nprocs_pof2); } + } else if (0 != mca_coll_libnbc_component.forced_params[REDUCE].algorithm) { + alg = mca_coll_libnbc_component.forced_params[REDUCE].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + segsize = mca_coll_libnbc_component.forced_params[REDUCE].segsize; } else { - if (libnbc_ireduce_algorithm == 1) { - alg = NBC_RED_CHAIN; - } else if (libnbc_ireduce_algorithm == 2) { - alg = NBC_RED_BINOMIAL; - } else if (libnbc_ireduce_algorithm == 3 && ompi_op_is_commute(op) && p > 2 && count >= nprocs_pof2) { - alg = NBC_RED_REDSCAT_GATHER; - } else { - alg = NBC_RED_CHAIN; - } + /* default */ + alg = nbc_reduce_default_algorithm(p, size, count, op, nprocs_pof2); + segsize = IREDUCE_DEFAULT_SEGSIZE; + } + + if (NBC_RED_REDSCAT_GATHER == alg && (!ompi_op_is_commute(op) || p <= 2 || count < nprocs_pof2)) { + alg = NBC_RED_CHAIN; + segsize = IREDUCE_DEFAULT_SEGSIZE; } /* allocate temporary buffers */ @@ -140,9 +199,14 @@ static int nbc_reduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Da } } else { tmpbuf = malloc (span); - segsize = 16384/2; } + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc ireduce : algorithm %d segmentsize %d", + alg + 1, segsize); + if(0 == segsize) { + segsize = count * size; /* only one frag */ + } if (OPAL_UNLIKELY(NULL == tmpbuf)) { return OMPI_ERR_OUT_OF_RESOURCE; } diff --git a/ompi/mca/coll/libnbc/nbc_iscan.c b/ompi/mca/coll/libnbc/nbc_iscan.c index ccc531d669e..317a1c537cf 100644 --- a/ompi/mca/coll/libnbc/nbc_iscan.c +++ b/ompi/mca/coll/libnbc/nbc_iscan.c @@ -11,6 +11,7 @@ * reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -32,6 +33,36 @@ static inline int scan_sched_recursivedoubling( int count, MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2); +static mca_base_var_enum_value_t iscan_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "recursive_doubling"}, + {0, NULL} +}; + +/* The following are used by dynamic and forced rules */ + +/* this routine is called by the component only */ +/* module does not call this it calls the forced_getvalues routine instead */ + +int ompi_coll_libnbc_scan_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + (void) mca_base_var_enum_create("coll_libnbc_iscan_algorithms", iscan_algorithms, &new_enum); + + mca_coll_libnbc_component.forced_params[SCAN].algorithm = 0; + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "iscan_algorithm", + "Which scan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[SCAN].algorithm); + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { @@ -71,16 +102,34 @@ static int nbc_scan_init(const void* sendbuf, void* recvbuf, int count, MPI_Data return nbc_get_noop_request(persistent, request); } + if (libnbc_module->com_rules[SCAN]) { + /* compute data size to choose correct rule */ + size_t dsize; + ompi_datatype_type_size (datatype, &dsize); + dsize *= count; + /* get algorithm */ + int algorithm, dummy1, dummy2, dummy3; + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[SCAN], + dsize, &dummy1, &dummy2, &dummy3); + if (algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = 
NBC_SCAN_LINEAR; /* default */ + } + } else if (0 != mca_coll_libnbc_component.forced_params[SCAN].algorithm) { + alg = mca_coll_libnbc_component.forced_params[SCAN].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_SCAN_LINEAR; /* default */ + } + span = opal_datatype_span(&datatype->super, count, &gap); - if (libnbc_iscan_algorithm == 2) { - alg = NBC_SCAN_RDBL; + if (NBC_SCAN_RDBL == alg) { ptrdiff_t span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t); tmpbuf = malloc(span_align + span); if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } tmpbuf1 = (void *)(-gap); tmpbuf2 = (char *)(span_align) - gap; } else { - alg = NBC_SCAN_LINEAR; if (rank > 0) { tmpbuf = malloc(span); if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; }