diff --git a/.mailmap b/.mailmap index 4ad3e4cbedb..9dc3063d881 100644 --- a/.mailmap +++ b/.mailmap @@ -113,3 +113,6 @@ Anandhi S Jayakumar Mohan Gandhi Harumi Kuno + +GONCALVES, Thomas Thomas Goncalves +GONCALVES, Thomas GONCALVES, THOMAS diff --git a/ompi/mca/coll/base/Makefile.am b/ompi/mca/coll/base/Makefile.am index e513dce6049..573634be3fe 100644 --- a/ompi/mca/coll/base/Makefile.am +++ b/ompi/mca/coll/base/Makefile.am @@ -20,6 +20,8 @@ dist_ompidata_DATA = base/help-mca-coll-base.txt headers += \ base/base.h \ + base/coll_base_dynamic_file.h \ + base/coll_base_dynamic_rules.h \ base/coll_tags.h \ base/coll_base_topo.h \ base/coll_base_util.h \ @@ -36,6 +38,8 @@ libmca_coll_la_SOURCES += \ base/coll_base_allgather.c \ base/coll_base_allgatherv.c \ base/coll_base_util.c \ + base/coll_base_dynamic_file.c \ + base/coll_base_dynamic_rules.c \ base/coll_base_allreduce.c \ base/coll_base_alltoall.c \ base/coll_base_gather.c \ diff --git a/ompi/mca/coll/base/coll_base_dynamic_file.c b/ompi/mca/coll/base/coll_base_dynamic_file.c new file mode 100644 index 00000000000..dead3b47ba1 --- /dev/null +++ b/ompi/mca/coll/base/coll_base_dynamic_file.c @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2020 Bull SAS. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include +#include + +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "base.h" + +/* need to include our own topo prototypes so we can malloc data on the comm correctly */ +#include "coll_base_topo.h" + +/* need file reading function */ +#include "coll_base_util.h" + +/* also need the dynamic rule structures */ +#include "coll_base_dynamic_rules.h" + +/* and our own prototypes */ +#include "coll_base_dynamic_file.h" + +static int fileline=0; /* used for verbose error messages */ + +#define getnext(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) + +/* + * Reads a rule file called fname + * Builds the algorithm rule table for a max of n_collectives + * + * If an error occurs it removes rule table and then exits with a very verbose + * error message (this stops the user using a half baked rule table + * + * Returns the number of actual collectives that a rule exists for + * (note 0 is NOT an error) + * + */ + +int ompi_coll_base_read_rules_config_file (char *fname, int format_version, ompi_coll_base_alg_rule_t** rules, int n_collectives) +{ + FILE *fptr = (FILE*) NULL; + long X, CI, NCS, CS, ALG, NMS, FANINOUT, nodes_rules_number, nodes_number; + long MS, SS; + int x, ncs, nms, node_rule; + int ret = OMPI_SUCCESS; + + ompi_coll_base_alg_rule_t *alg_rules = (ompi_coll_base_alg_rule_t*) NULL; /* complete table of rules */ + + /* individual pointers to sections of rules */ + ompi_coll_base_alg_rule_t *alg_p = (ompi_coll_base_alg_rule_t*) NULL; + ompi_coll_base_nodes_rule_t *nodes_p = (ompi_coll_base_nodes_rule_t*) NULL; + ompi_coll_base_com_rule_t *com_p = (ompi_coll_base_com_rule_t*) NULL; + ompi_coll_base_msg_rule_t *msg_p = (ompi_coll_base_msg_rule_t*) NULL; + + /* stats info */ + int total_alg_count = 0; + int total_nodes_count = 0; + int total_com_count = 0; + int total_msg_count = 0; + + if (!fname) { + 
opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Gave NULL as rule table configuration file for base collectives... ignoring!\n"); + ret = -1; + goto on_file_error; + } + + if (!rules) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Gave NULL as rule table result ptr!... ignoring!\n"); + ret = -2; + goto on_file_error; + } + + if (n_collectives<1) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Gave %d as max number of collectives in the rule table configuration file for base collectives!... ignoring!\n", n_collectives); + ret = -3; + goto on_file_error; + } + + fptr = fopen (fname, "r"); + if (!fptr) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"cannot read rules file [%s]\n", fname); + ret = -4; + goto on_file_error; + } + + switch (format_version) { + case COMM_MSG_FORMAT: + opal_output_verbose(10,ompi_coll_base_framework.framework_output,"Reading dynamic rule format %d : \n", COMM_MSG_FORMAT); + break; + case NODES_COMM_MSG_FORMAT: + opal_output_verbose(10,ompi_coll_base_framework.framework_output,"Reading dynamic rule format %d : \n", NODES_COMM_MSG_FORMAT); + break; + default: + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Format value %d is unknown\n", format_version); + ret = -5; + goto on_file_error; + break; + } + + /* make space and init the algorithm rules for each of the n_collectives MPI collectives */ + alg_rules = ompi_coll_base_mk_alg_rules (n_collectives); + if (NULL == alg_rules) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"cannot cannot allocate rules (n=%d) for file [%s]\n", n_collectives, fname); + ret = -6; + goto on_file_error; + } + + if( (getnext(fptr, &X) < 0) || (X < 0) ) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Could not read number of collectives in configuration file around line %d\n", fileline); + ret = -7; + goto on_file_error; + } + if (X>n_collectives) { + 
opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline); + ret = -8; + goto on_file_error; + } + + for (x=0;x=n_collectives) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Collective id in configuration file %ld is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline); + ret = -8; + goto on_file_error; + } + + if (alg_rules[CI].alg_rule_id != CI) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output, "Internal error in handling collective ID %ld\n", CI); + ret = -9; + goto on_file_error; + } + opal_output_verbose(100,ompi_coll_base_framework.framework_output, "Reading dynamic rule for collective ID %ld\n", CI); + alg_p = &alg_rules[CI]; + + alg_p->alg_rule_id = CI; + /* If format specify a nodes number */ + if (NODES_COMM_MSG_FORMAT == format_version) { + if (getnext (fptr, &nodes_rules_number) < 0 || nodes_rules_number < 0) { ; + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Could not read count of nodes number for collective ID %ld at around line %d\n", CI, fileline); + ret = -7; + goto on_file_error; + } + alg_p->n_nodes_sizes = nodes_rules_number; + } + else { + nodes_rules_number = 1; + alg_p->n_nodes_sizes = 1; + } + alg_p->nodes_rules = ompi_coll_base_mk_nodes_rules (nodes_rules_number, CI); + + for (node_rule = 0 ; node_rule < nodes_rules_number ; node_rule++) { /* for each nodes number */ + + nodes_p = &(alg_p->nodes_rules[node_rule]); + + if (NODES_COMM_MSG_FORMAT == format_version) { + if ( (getnext (fptr, &nodes_number) < 0) || (nodes_number < 0)){ + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Could not read nodes number for collective ID %ld node rule %d at around line %d\n", CI, node_rule, fileline); + ret = -7; + goto on_file_error; + } + nodes_p->nodes_number = 
nodes_number; + } + else { + nodes_p->nodes_number = 1; /* Only one configuration - all cases */ + } + + if( (getnext (fptr, &NCS) < 0) || (NCS < 0) ) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Could not read count of communicators for collective ID %ld node rule %d at around line %d\n", CI, node_rule, fileline); + ret = -7; + goto on_file_error; + } + opal_output_verbose(100,ompi_coll_base_framework.framework_output, "Read communicator count %ld for dynamic rule for collective ID %ld with node count %d\n", NCS, CI, nodes_p->nodes_number); + nodes_p->n_com_sizes = NCS; + nodes_p->com_rules = ompi_coll_base_mk_com_rules (NCS, CI, node_rule); + + for (ncs=0;ncscom_rules[ncs]); + + if( (getnext (fptr, &CS) < 0) || (CS < 0) ) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline); + ret = -7; + goto on_file_error; + } + + com_p->mpi_comsize = CS; + + if( (getnext (fptr, &NMS) < 0) || (NMS < 0) ) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline); + ret = -7; + goto on_file_error; + } + opal_output_verbose(100,ompi_coll_base_framework.framework_output, "Read message count %ld for dynamic rule for collective ID %ld and comm size %ld\n", + NMS, CI, CS); + com_p->n_msg_sizes = NMS; + com_p->msg_rules = ompi_coll_base_mk_msg_rules (NMS, CI, node_rule, ncs, CS); + + for (nms=0;nmsmsg_rules[nms]); + + if( (getnext (fptr, &MS) < 0) || (MS < 0) ) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline); + ret = -7; + goto on_file_error; + } + msg_p->msg_size = (size_t)MS; + + if( (getnext (fptr, &ALG) < 0) || (ALG < 0) ) { + 
opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline); + ret = -7; + goto on_file_error; + } + msg_p->result_alg = ALG; + + if( (getnext (fptr, &FANINOUT) < 0) || (FANINOUT < 0) ) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline); + ret = -7; + goto on_file_error; + } + msg_p->result_topo_faninout = FANINOUT; + + if( (getnext (fptr, &SS) < 0) || (SS < 0) ) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline); + ret = -7; + goto on_file_error; + } + msg_p->result_segsize = SS; + + /* the first message rule of every com rule must be for message size 0 */ + if (!nms && MS) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"All algorithms must specify a rule for message size of zero upwards always first!\n"); + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Message size was %ld for collective ID %ld com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline); /* MS and CI are long, ncs and nms are int: specifiers must match the argument types */ + ret = -7; + goto on_file_error; + } + + total_msg_count++; + + } /* msg size */ + + total_com_count++; + + } /* comm size */ + + total_nodes_count++; + + } /* nodes number */ + total_alg_count++; + opal_output_verbose(100,ompi_coll_base_framework.framework_output, "Done reading dynamic rule for collective ID %ld\n", CI); + + } /* per collective */ + + ret = total_alg_count; + fclose (fptr); + + opal_output_verbose(10,ompi_coll_base_framework.framework_output,"\nConfigure file Stats\n"); + opal_output_verbose(10,ompi_coll_base_framework.framework_output,"Collectives with rules\t\t\t: %5d\n", total_alg_count); + opal_output_verbose(10,ompi_coll_base_framework.framework_output,"Nodes count with rules\t\t\t: %5d\n", 
total_nodes_count); + opal_output_verbose(10,ompi_coll_base_framework.framework_output,"Communicator sizes with rules\t\t: %5d\n", total_com_count); + opal_output_verbose(10,ompi_coll_base_framework.framework_output,"Message sizes with rules\t\t\t: %5d\n", total_msg_count); + opal_output_verbose(10,ompi_coll_base_framework.framework_output,"Lines in configuration file read\t\t: %5d\n", fileline); + + /* return the rules to the caller */ + *rules = alg_rules; + + return ret; + + + on_file_error: + + /* here we close out the file and delete any memory allocated nicely */ + /* we return back a verbose message and a count of -1 algorithms read */ + /* draconian but its better than having a bad collective decision table */ + + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"read_rules_config_file: bad configure file [%s] which considered format is %d. Read afar as line %d\n" + , fname, format_version, fileline); + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Ignoring user supplied base collectives configuration decision file.\n"); + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Switching back to [compiled in] fixed decision table.\n"); + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Fix errors as listed above and try again.\n"); + + /* deallocate memory if allocated */ + if (alg_rules) ompi_coll_base_free_all_rules (alg_rules, n_collectives); + + /* close file */ + if (fptr) fclose (fptr); + + *rules = (ompi_coll_base_alg_rule_t*) NULL; + return ret; +} + diff --git a/ompi/mca/coll/base/coll_base_dynamic_file.h b/ompi/mca/coll/base/coll_base_dynamic_file.h new file mode 100644 index 00000000000..96c1d0ee218 --- /dev/null +++ b/ompi/mca/coll/base/coll_base_dynamic_file.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 Bull SAS. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_BASE_DYNAMIC_FILE_H_HAS_BEEN_INCLUDED +#define MCA_COLL_BASE_DYNAMIC_FILE_H_HAS_BEEN_INCLUDED + +#include "ompi_config.h" + +/* also need the dynamic rule structures */ +#include "coll_base_dynamic_rules.h" + + +BEGIN_C_DECLS + +int ompi_coll_base_read_rules_config_file (char *fname, int format_version, ompi_coll_base_alg_rule_t** rules, int n_collectives); + + +END_C_DECLS +#endif /* MCA_COLL_BASE_DYNAMIC_FILE_H_HAS_BEEN_INCLUDED */ + + diff --git a/ompi/mca/coll/base/coll_base_dynamic_rules.c b/ompi/mca/coll/base/coll_base_dynamic_rules.c new file mode 100644 index 00000000000..5173dea3a2f --- /dev/null +++ b/ompi/mca/coll/base/coll_base_dynamic_rules.c @@ -0,0 +1,482 @@ +/* + * Copyright (c) 2020 Bull SAS. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "base.h" +#include "ompi/constants.h" + +/* need to include our own topo prototypes so we can malloc data on the comm correctly */ +#include "coll_base_topo.h" + +/* also need the dynamic rule structures */ +#include "coll_base_dynamic_rules.h" + +#include +#include + +#include "coll_base_util.h" + + +ompi_coll_base_alg_rule_t* ompi_coll_base_mk_alg_rules (int n_alg) +{ + int i; + ompi_coll_base_alg_rule_t* alg_rules; + + alg_rules = (ompi_coll_base_alg_rule_t *) calloc (n_alg, sizeof (ompi_coll_base_alg_rule_t)); + if (!alg_rules) return (alg_rules); + + /* set all we can at this point */ + for (i=0;ialg_rule_id, + msg_p->node_rule_id, msg_p->com_rule_id, msg_p->mpi_comsize, msg_p->msg_rule_id); + + opal_output_verbose(50,ompi_coll_base_framework.framework_output,"msg_size %10lu -> algorithm %2d\ttopo in/out %2d\tsegsize %5ld\tmax_requests %4d\n", + msg_p->msg_size, msg_p->result_alg, msg_p->result_topo_faninout, msg_p->result_segsize, + msg_p->result_max_requests); + + return 
(0); +} + + +int ompi_coll_base_dump_com_rule (ompi_coll_base_com_rule_t* com_p) +{ + int i; + + if (!com_p) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Com rule was a NULL ptr?!\n"); + return (-1); + } + + opal_output_verbose(50,ompi_coll_base_framework.framework_output, "alg_id %3d\tnode_id %3d\tcom_id %3d\tcom_size %3d\t", com_p->alg_rule_id, com_p->node_rule_id, com_p->com_rule_id, com_p->mpi_comsize); + + if (!com_p->n_msg_sizes) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"no msgsizes defined\n"); + return (0); + } + int rc=0; + opal_output_verbose(50,ompi_coll_base_framework.framework_output,"number of message sizes %3d\n", com_p->n_msg_sizes); + + for (i=0;in_msg_sizes;i++) { + rc += ompi_coll_base_dump_msg_rule (&(com_p->msg_rules[i])); + } + + return rc; +} + + +int ompi_coll_base_dump_nodes_rule (ompi_coll_base_nodes_rule_t* nodes_p) +{ + int i; + + if (!nodes_p) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Node rule was a NULL ptr?!\n"); + return (-1); + } + + opal_output_verbose(50,ompi_coll_base_framework.framework_output,"alg_id %3d\tnode_id %3d", nodes_p->alg_rule_id, nodes_p->node_rule_id); + + if (!nodes_p->n_com_sizes) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"no coms defined\n"); + return (0); + } + int rc=0; + opal_output_verbose(50,ompi_coll_base_framework.framework_output,"number of com sizes %3d\n", nodes_p->n_com_sizes); + + for (i=0;in_com_sizes;i++) { + rc += ompi_coll_base_dump_com_rule (&(nodes_p->com_rules[i])); + } + + return rc; +} + + +int ompi_coll_base_dump_alg_rule (ompi_coll_base_alg_rule_t* alg_p) +{ + int i; + + if (!alg_p) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Algorithm rule was a NULL ptr?!\n"); + return (-1); + } + + opal_output_verbose(50,ompi_coll_base_framework.framework_output,"alg_id %3d\t", alg_p->alg_rule_id); + + if (!alg_p->n_nodes_sizes) { + 
opal_output_verbose(1,ompi_coll_base_framework.framework_output,"no nodes defined\n"); + return (0); + } + int rc=0; + opal_output_verbose(50,ompi_coll_base_framework.framework_output,"number of nodes sizes %3d\n", alg_p->n_nodes_sizes); + + for (i=0;in_nodes_sizes;i++) { + rc += ompi_coll_base_dump_nodes_rule (&(alg_p->nodes_rules[i])); + } + + return rc; +} + + +int ompi_coll_base_dump_all_rules (ompi_coll_base_alg_rule_t* alg_p, int n_rules) +{ + int i; + + if (!alg_p) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"Algorithm rule was a NULL ptr?!\n"); + return (-1); + } + int rc=0; + + opal_output_verbose(50,ompi_coll_base_framework.framework_output,"Number of algorithm rules %3d\n", n_rules); + + for (i=0;in_msg_sizes) { + ompi_coll_base_msg_rule_t* msg_p; + msg_p = com_p->msg_rules; + + if (!msg_p) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"attempt to free NULL msg_rules when msg count was %d\n", com_p->n_msg_sizes); + rc = -1; /* some error */ + } + else { + /* ok, memory exists for the msg rules so free that first */ + free (com_p->msg_rules); + com_p->msg_rules = (ompi_coll_base_msg_rule_t*) NULL; + com_p->n_msg_sizes = 0; + } + + } /* if we have msg rules to free as well */ + + return (rc); +} + + +int ompi_coll_base_free_coms_rules_in_node_rule (ompi_coll_base_nodes_rule_t* nodes_p) +{ + int rc=0; + + if (!nodes_p) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"attempt to free NULL node_rule ptr\n"); + return (-1); + } + + if (nodes_p->n_com_sizes) { + ompi_coll_base_com_rule_t* com_p; + com_p = nodes_p->com_rules; + + if (!com_p) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"attempt to free NULL com_rules when com count was %d\n", nodes_p->n_com_sizes); + } else { + int i; + /* ok, memory exists for the com rules so free their message rules first */ + for( i = 0; i < nodes_p->n_com_sizes; i++ ) { + com_p = &(nodes_p->com_rules[i]); + 
ompi_coll_base_free_msg_rules_in_com_rule (com_p); + } + /* we are now free to free the com rules themselves */ + free (nodes_p->com_rules); + nodes_p->com_rules = (ompi_coll_base_com_rule_t*) NULL; + nodes_p->n_com_sizes = 0; + } + + } /* if we have com rules to free as well */ + + return (rc); +} + +int ompi_coll_base_free_nodes_rules_in_alg_rule (ompi_coll_base_alg_rule_t* alg_p) +{ + int rc=0; + + if (!alg_p) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"attempt to free NULL alg_rule ptr\n"); + return (-1); + } + + if (alg_p->n_nodes_sizes) { + ompi_coll_base_nodes_rule_t* nodes_p; + nodes_p = alg_p->nodes_rules; + + if (!nodes_p) { + opal_output_verbose(1,ompi_coll_base_framework.framework_output,"attempt to free NULL nodes_rules when nodes count was %d\n", alg_p->n_nodes_sizes); + } else { + int i; + /* ok, memory exists for the nodes rules so free their com rules first */ + for( i = 0; i < alg_p->n_nodes_sizes; i++ ) { + nodes_p = &(alg_p->nodes_rules[i]); + ompi_coll_base_free_coms_rules_in_node_rule (nodes_p); + } + /* we are now free to free the nodes rules themselves */ + free (alg_p->nodes_rules); + alg_p->nodes_rules = (ompi_coll_base_nodes_rule_t*) NULL; + alg_p->n_nodes_sizes = 0; + } + + } /* if we have nodes rules to free as well */ + + return (rc); +} + +/* Free the nodes rules of each of the n_algs entries of alg_p, then the alg_p array itself. Returns the sum of the per-entry return codes. alg_p is not NULL-checked here. */ +int ompi_coll_base_free_all_rules (ompi_coll_base_alg_rule_t* alg_p, int n_algs) +{ + int i; + int rc = 0; + + for( i = 0; i < n_algs; i++ ) { + rc += ompi_coll_base_free_nodes_rules_in_alg_rule (&(alg_p[i])); + } + + free (alg_p); + + return (rc); +} + +/* + * query functions + * i.e. the functions that get me the algorithm, topo fanin/out and segment size fast + * and also get the rules that are needed by each communicator as needed + * + */ + +/* + * This function is used to get the pointer to the nearest (less than or equal) + * com rule for this MPI collective (alg_id) for a given + * MPI communicator size. The complete rule base must be presented. 
+ * + * If no rule exits returns NULL, else the com rule ptr + * (which can be used in the coll_base_get_target_method_params() call) + * + */ +ompi_coll_base_com_rule_t* ompi_coll_base_get_com_rule_ptr (ompi_coll_base_alg_rule_t* rules, int alg_id, int nodes_nb, int mpi_comsize) +{ + ompi_coll_base_alg_rule_t* alg_p = (ompi_coll_base_alg_rule_t*) NULL; + ompi_coll_base_nodes_rule_t* nodes_p = (ompi_coll_base_nodes_rule_t*) NULL; + ompi_coll_base_nodes_rule_t* best_nodes_p = (ompi_coll_base_nodes_rule_t*) NULL; + ompi_coll_base_com_rule_t* com_p = (ompi_coll_base_com_rule_t*) NULL; + ompi_coll_base_com_rule_t* best_com_p = (ompi_coll_base_com_rule_t*) NULL; + int i; + + if (!rules) { /* no rule base no resulting com rule */ + return ((ompi_coll_base_com_rule_t*)NULL); + } + + alg_p = &(rules[alg_id]); /* get the algorithm rule pointer */ + + if (!alg_p->n_nodes_sizes) { /* check for count of communicator nodes number */ + return ((ompi_coll_base_com_rule_t*)NULL); /* no nodes sizes so no rule */ + } + + best_nodes_p = NULL; + nodes_p = alg_p->nodes_rules; + i = 0; + + while( i < alg_p->n_nodes_sizes ) { + if (nodes_p->nodes_number > nodes_nb) { + break; + } + best_nodes_p = nodes_p; + /* go to the next entry */ + nodes_p++; + i++; + } + if(NULL == best_nodes_p) { /* Check if there is a valid node configuration */ + return ((ompi_coll_base_com_rule_t*)NULL); /* no config so no rule */ + } + opal_output_verbose(50,ompi_coll_base_framework.framework_output,"Selected the following node rule id %d nodes_nb %d (%d)\n", best_nodes_p->node_rule_id, best_nodes_p->nodes_number, nodes_nb); + + if (!best_nodes_p->n_com_sizes) { /* check for count of communicator sizes */ + return ((ompi_coll_base_com_rule_t*)NULL); /* no com sizes so no rule */ + } + + /* ok have some com sizes, now to find the one closest to my mpi_comsize */ + + /* make a copy of the first com rule */ + best_com_p = NULL; + com_p = best_nodes_p->com_rules; + i = 0; + + while( i < best_nodes_p->n_com_sizes ) 
{ + if (com_p->mpi_comsize > mpi_comsize) { + break; + } + best_com_p = com_p; + /* go to the next entry */ + com_p++; + i++; + } + if(NULL == best_com_p) { /* Check that some com rule has mpi_comsize <= the requested size */ + return ((ompi_coll_base_com_rule_t*)NULL); /* no config so no rule */ + } + + opal_output_verbose(50,ompi_coll_base_framework.framework_output,"Selected the following com rule id %d\n", best_com_p->com_rule_id); + ompi_coll_base_dump_com_rule (best_com_p); + + return (best_com_p); +} + +/* + * This function takes a com_rule ptr (from the communicators coll base data structure) + * (Which is chosen for a particular MPI collective) + * and a (total_)msg_size and it returns (0) and a algorithm to use and a recommended topo faninout and segment size + * all based on the user supplied rules + * + * Just like the above functions it uses a less than or equal msg size + * (hense config file must have a default defined for '0' if we reach this point) + * else if no rules match we return '0' + '0,0' or used fixed decision table with no topo chand and no segmentation + * of users data.. shame. 
+ * + * On error return 0 so we default to fixed rules anyway :) + * + */ + +int ompi_coll_base_get_target_method_params (ompi_coll_base_com_rule_t* base_com_rule, size_t mpi_msgsize, int *result_topo_faninout, + int* result_segsize, int* max_requests) +{ + ompi_coll_base_msg_rule_t* msg_p = (ompi_coll_base_msg_rule_t*) NULL; + ompi_coll_base_msg_rule_t* best_msg_p = (ompi_coll_base_msg_rule_t*) NULL; + int i; + + /* No rule or zero rules */ + if( (NULL == base_com_rule) || (0 == base_com_rule->n_msg_sizes)) { + return (0); + } + + /* ok have some msg sizes, now to find the one closest to my mpi_msgsize */ + + /* make a copy of the first msg rule */ + best_msg_p = msg_p = base_com_rule->msg_rules; + i = 0; + + while (in_msg_sizes) { + if (msg_p->msg_size <= mpi_msgsize) { + best_msg_p = msg_p; + } + else { + break; + } + /* go to the next entry */ + msg_p++; + i++; + } + + opal_output_verbose(50,ompi_coll_base_framework.framework_output,"Selected the following msg rule id %d\n", best_msg_p->msg_rule_id); + ompi_coll_base_dump_msg_rule (best_msg_p); + + /* return the segment size */ + *result_topo_faninout = best_msg_p->result_topo_faninout; + + /* return the segment size */ + *result_segsize = best_msg_p->result_segsize; + + /* return the maximum requests */ + *max_requests = best_msg_p->result_max_requests; + + /* return the algorithm/method to use */ + return (best_msg_p->result_alg); +} diff --git a/ompi/mca/coll/base/coll_base_dynamic_rules.h b/ompi/mca/coll/base/coll_base_dynamic_rules.h new file mode 100644 index 00000000000..95af1cd7fc8 --- /dev/null +++ b/ompi/mca/coll/base/coll_base_dynamic_rules.h @@ -0,0 +1,103 @@ + +/* + * Copyright (c) 2020 Bull SAS. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_BASE_DYNAMIC_RULES_H_HAS_BEEN_INCLUDED +#define MCA_COLL_BASE_DYNAMIC_RULES_H_HAS_BEEN_INCLUDED + +#include "ompi_config.h" + +BEGIN_C_DECLS + + +typedef struct base_msg_rule_s { + /* paranoid / debug */ + int mpi_comsize; /* which MPI comm size this is is for */ + int alg_rule_id; /* unique alg rule id */ + int node_rule_id; /* unique node rule id */ + int com_rule_id; /* unique com rule id */ + int msg_rule_id; /* unique msg rule id */ + + /* RULE */ + size_t msg_size; /* message size */ + + /* RESULT */ + int result_alg; /* result algorithm to use */ + int result_topo_faninout; /* result topology fan in/out to use (if applicable) */ + long result_segsize; /* result segment size to use */ + int result_max_requests; /* maximum number of outstanding requests (if applicable) */ +} ompi_coll_base_msg_rule_t; + + +typedef struct base_com_rule_s { + /* paranoid / debug */ + int mpi_comsize; /* which MPI comm size this is is for */ + int alg_rule_id; /* unique alg rule id */ + int node_rule_id; /* unique node rule id */ + int com_rule_id; /* unique com rule id */ + + /* RULE */ + int n_msg_sizes; + ompi_coll_base_msg_rule_t *msg_rules; + +} ompi_coll_base_com_rule_t; + +typedef struct base_nodes_rule_s { + /* paranoid / debug */ + int nodes_number; /* which MPI comm nodes number this is is for */ + int alg_rule_id; /* unique alg rule id */ + int node_rule_id; /* unique node rule id */ + + /* RULE */ + int n_com_sizes; + ompi_coll_base_com_rule_t *com_rules; +} ompi_coll_base_nodes_rule_t; + +typedef struct base_alg_rule_s { + /* paranoid / debug */ + int alg_rule_id; /* unique alg rule id */ + /* RULE */ + int n_nodes_sizes; + ompi_coll_base_nodes_rule_t *nodes_rules; +} ompi_coll_base_alg_rule_t; + +/* function prototypes */ + +/* these are used to build the rule tables (by the read file routines) */ +ompi_coll_base_alg_rule_t* ompi_coll_base_mk_alg_rules (int n_alg); 
+ompi_coll_base_nodes_rule_t* ompi_coll_base_mk_nodes_rules (int n_nodes_rules, int alg_rule_id); +ompi_coll_base_com_rule_t* ompi_coll_base_mk_com_rules (int n_com_rules, int alg_rule_id, int node_rule_id); +ompi_coll_base_msg_rule_t* ompi_coll_base_mk_msg_rules (int n_msg_rules, int alg_rule_id, int node_rule_id, int com_rule_id, int mpi_comsize); + +/* debugging support */ +int ompi_coll_base_dump_msg_rule (ompi_coll_base_msg_rule_t* msg_p); +int ompi_coll_base_dump_com_rule (ompi_coll_base_com_rule_t* com_p); +int ompi_coll_base_dump_nodes_rule (ompi_coll_base_nodes_rule_t* nodes_p); +int ompi_coll_base_dump_alg_rule (ompi_coll_base_alg_rule_t* alg_p); +int ompi_coll_base_dump_all_rules (ompi_coll_base_alg_rule_t* alg_p, int n_rules); + +/* free alloced memory routines, used by file and base component/module */ +int ompi_coll_base_free_msg_rules_in_com_rule (ompi_coll_base_com_rule_t* com_p); +int ompi_coll_base_free_coms_rules_in_node_rule (ompi_coll_base_nodes_rule_t* nodes_p); +int ompi_coll_base_free_nodes_rules_in_alg_rule (ompi_coll_base_alg_rule_t* alg_p); +int ompi_coll_base_free_all_rules (ompi_coll_base_alg_rule_t* alg_p, int n_algs); + + +/* the IMPORTANT routines, i.e. the ones that do stuff for everyday communicators and collective calls */ + +ompi_coll_base_com_rule_t* ompi_coll_base_get_com_rule_ptr (ompi_coll_base_alg_rule_t* rules, int alg_id, int nodes_nb, int mpi_comsize); + +int ompi_coll_base_get_target_method_params (ompi_coll_base_com_rule_t* base_com_rule, size_t mpi_msgsize, + int* result_topo_faninout, int* result_segsize, + int* max_requests); + + +END_C_DECLS +#endif /* MCA_COLL_BASE_DYNAMIC_RULES_H_HAS_BEEN_INCLUDED */ + diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index ffed201c414..2d13aa9c9a6 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -11,6 +11,7 @@ * All rights reserved. 
* Copyright (c) 2014-2020 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -558,3 +559,69 @@ const char* mca_coll_base_colltype_to_str(int collid) } return colltype_translation_table[collid]; } + +OBJ_CLASS_INSTANCE(ompi_coll_base_hostname_item_t, opal_list_item_t, NULL, NULL); + +int +ompi_coll_base_get_nnodes(struct ompi_communicator_t *comm) +{ + ompi_group_t* group; + ompi_proc_t *proc; + opal_list_t hostname_list; + ompi_coll_base_hostname_item_t *host_item, *next, *new_item; + bool already_in_the_list; + char* hostname; + int i, cmp, group_size, nodes_nb; + + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll get_nnodes called.")); + OBJ_CONSTRUCT(&hostname_list, opal_list_t); + /* Is inter communicator ? */ + if (OMPI_COMM_IS_INTER(comm)) { + group = comm->c_remote_group; + } else { + group = comm->c_local_group; + } + /* allocate an array of node id */ + group_size = ompi_group_size(group); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll get_nnodes group size %d.", group_size)); + /* For each rank */ + for (i=0 ; isuper); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll get_nnodes host %s.", hostname)); + /* Check if hostname is OK. */ + if(0 == strcmp(hostname,"unknown")) { + /* Do not consider this rank */ + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll get_nnodes ignoring unknown host.")); + continue; + } + already_in_the_list = false; + /* Try to find eleme in the list */ + /* List is sorted we stop if current elem is "higher" than the one we expect to find */ + /* In this case this element is inserted at this position. */ + /* If element is found, no insertion is performed, element is inserted at the end otherwise. 
*/ + OPAL_LIST_FOREACH_SAFE(host_item, next, &hostname_list, ompi_coll_base_hostname_item_t) { + cmp = strcmp(host_item->hostname, hostname); + if(cmp > 0) { + /* no match found, insert at this position */ + break; + } + else if(0 == cmp) { + /* Found, do not insert this hostname */ + already_in_the_list = true; + break; + } + /* continue comparing elements otherwise */ + } + if (false == already_in_the_list) { + /* Insert a new element at current position */ + new_item = OBJ_NEW(ompi_coll_base_hostname_item_t); + new_item->hostname = hostname; + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll get_nnodes insert host %s", new_item->hostname)); + opal_list_insert_pos(&hostname_list, &host_item->super, &new_item->super); + } + } + nodes_nb = opal_list_get_size(&hostname_list); + OPAL_LIST_DESTRUCT(&hostname_list); + return nodes_nb; +} diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 46c95153469..c64c362758c 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -11,6 +11,7 @@ * All rights reserved. * Copyright (c) 2014-2020 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -25,6 +26,7 @@ #include "mpi.h" #include "ompi/mca/mca.h" +#include "ompi/mca/coll/coll.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/request/request.h" #include "ompi/communicator/communicator.h" @@ -97,6 +99,14 @@ struct mca_coll_base_avail_coll_t { typedef struct mca_coll_base_avail_coll_t mca_coll_base_avail_coll_t; OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_avail_coll_t); +/* hostname item used to get the number of nodes of a communicator */ +struct ompi_coll_base_hostname_item_t { + opal_list_item_t super; + char* hostname; +}; +typedef struct ompi_coll_base_hostname_item_t ompi_coll_base_hostname_item_t ; +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_coll_base_hostname_item_t); + /** * A MPI_like function doing a send and a receive simultaneously. * If one of the communications results in a zero-byte message the @@ -189,6 +199,10 @@ int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expecte /* Miscelaneous function */ const char* mca_coll_base_colltype_to_str(int collid); int mca_coll_base_name_to_colltype(const char* name); +int ompi_coll_base_get_nnodes(struct ompi_communicator_t *comm); + +#define COMM_MSG_FORMAT 0 +#define NODES_COMM_MSG_FORMAT 1 END_C_DECLS #endif /* MCA_COLL_BASE_UTIL_EXPORT_H */ diff --git a/ompi/mca/coll/base/help-mca-coll-base.txt b/ompi/mca/coll/base/help-mca-coll-base.txt index d6e0071fa7a..0e2e1c86c2b 100644 --- a/ompi/mca/coll/base/help-mca-coll-base.txt +++ b/ompi/mca/coll/base/help-mca-coll-base.txt @@ -11,6 +11,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2020 Bull SAS. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -39,9 +40,11 @@ should be able to be chosen for any communicator. 
As such, this likely means that something else is wrong (although you should double check that the "basic", "libnbc" and "self" coll components are available on your system -- check the output of the "ompi_info" command). -#[comm-unselect:failed-finalize] +# +[comm-unselect:failed-finalize] A coll module failed to finalize properly when a communicator that was using it was destroyed. This is somewhat unusual: the module itself may be at fault, or this may be a symptom of another issue (e.g., a memory problem). +# diff --git a/ompi/mca/coll/libnbc/Makefile.am b/ompi/mca/coll/libnbc/Makefile.am index 4afa48cdd2c..382c7884c5d 100644 --- a/ompi/mca/coll/libnbc/Makefile.am +++ b/ompi/mca/coll/libnbc/Makefile.am @@ -22,6 +22,8 @@ # $HEADER$ # +dist_ompidata_DATA = help-mpi-coll-libnbc.txt + sources = \ coll_libnbc.h \ coll_libnbc_component.c \ diff --git a/ompi/mca/coll/libnbc/coll_libnbc.h b/ompi/mca/coll/libnbc/coll_libnbc.h index 3bdeb9419fa..08bbb35d20c 100644 --- a/ompi/mca/coll/libnbc/coll_libnbc.h +++ b/ompi/mca/coll/libnbc/coll_libnbc.h @@ -17,6 +17,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -30,6 +31,8 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_base_util.h" #include "opal/sys/atomic.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_base_dynamic_rules.h" BEGIN_C_DECLS @@ -69,14 +72,15 @@ BEGIN_C_DECLS /* number of implemented collective functions */ #define NBC_NUM_COLL 17 -extern bool libnbc_ibcast_skip_dt_decision; -extern int libnbc_iallgather_algorithm; -extern int libnbc_iallreduce_algorithm; -extern int libnbc_ibcast_algorithm; -extern int libnbc_ibcast_knomial_radix; -extern int libnbc_iexscan_algorithm; -extern int libnbc_ireduce_algorithm; -extern int libnbc_iscan_algorithm; +/* forced algorithm choices */ +/* this structure is for storing the indexes to the forced algorithm mca params... */ +/* we get these at component query (so that registered values appear in ompi_info) */ +struct coll_libnbc_force_algorithm_mca_param_indices_t { + int algorithm; /* which algorithm you want to force */ + int segsize; + int topo; +}; +typedef struct coll_libnbc_force_algorithm_mca_param_indices_t coll_libnbc_force_algorithm_mca_param_indices_t; struct ompi_coll_libnbc_component_t { mca_coll_base_component_2_0_0_t super; @@ -84,6 +88,13 @@ struct ompi_coll_libnbc_component_t { opal_list_t active_requests; opal_atomic_int32_t active_comms; opal_mutex_t lock; /* protect access to the active_requests list */ + int dynamic_rules_verbose; + int stream; + coll_libnbc_force_algorithm_mca_param_indices_t forced_params[COLLCOUNT]; + /* cached decision table stuff */ + ompi_coll_base_alg_rule_t *all_base_rules; + int dynamic_rules_fileformat; + char* dynamic_rules_filename; }; typedef struct ompi_coll_libnbc_component_t ompi_coll_libnbc_component_t; @@ -94,6 +105,9 @@ struct ompi_coll_libnbc_module_t { mca_coll_base_module_t super; opal_mutex_t mutex; bool comm_registered; + + /* the communicator rules for each MPI collective for ONLY my comsize */ 
+ ompi_coll_base_com_rule_t *com_rules[COLLCOUNT]; #ifdef NBC_CACHE_SCHEDULE void *NBC_Dict[NBC_NUM_COLL]; /* this should point to a struct hb_tree, but since this is a @@ -160,6 +174,27 @@ int ompi_coll_libnbc_progress(void); int NBC_Init_comm(MPI_Comm comm, ompi_coll_libnbc_module_t *module); int NBC_Progress(NBC_Handle *handle); +int ompi_coll_libnbc_allgather_check_forced_init (void); +int ompi_coll_libnbc_allreduce_check_forced_init (void); +int ompi_coll_libnbc_alltoall_check_forced_init (void); +int ompi_coll_libnbc_alltoallv_check_forced_init (void); +int ompi_coll_libnbc_alltoallw_check_forced_init (void); +int ompi_coll_libnbc_barrier_check_forced_init (void); +int ompi_coll_libnbc_bcast_check_forced_init (void); +int ompi_coll_libnbc_exscan_check_forced_init (void); +int ompi_coll_libnbc_gather_check_forced_init (void); +int ompi_coll_libnbc_gatherv_check_forced_init (void); +int ompi_coll_libnbc_reduce_check_forced_init (void); +int ompi_coll_libnbc_reduce_scatter_check_forced_init (void); +int ompi_coll_libnbc_reduce_scatter_block_check_forced_init (void); +int ompi_coll_libnbc_scan_check_forced_init (void); +int ompi_coll_libnbc_scatter_check_forced_init (void); +int ompi_coll_libnbc_scatterv_check_forced_init (void); +int ompi_coll_libnbc_neighbor_allgather_check_forced_init (void); +int ompi_coll_libnbc_neighbor_allgatherv_check_forced_init (void); +int ompi_coll_libnbc_neighbor_alltoall_check_forced_init (void); +int ompi_coll_libnbc_neighbor_alltoallv_check_forced_init (void); +int ompi_coll_libnbc_neighbor_alltoallw_check_forced_init (void); int ompi_coll_libnbc_iallgather(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, struct ompi_communicator_t *comm, ompi_request_t ** request, diff --git a/ompi/mca/coll/libnbc/coll_libnbc_component.c b/ompi/mca/coll/libnbc/coll_libnbc_component.c index bcb0e06c2d9..95f896daa09 100644 --- a/ompi/mca/coll/libnbc/coll_libnbc_component.c +++ 
b/ompi/mca/coll/libnbc/coll_libnbc_component.c @@ -19,6 +19,7 @@ * Copyright (c) 2017 Ian Bradley Morgan and Anthony Skjellum. All * rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,6 +35,8 @@ #include "mpi.h" #include "ompi/mca/coll/coll.h" #include "ompi/communicator/communicator.h" +#include "ompi/mca/coll/base/coll_base_dynamic_file.h" +#include "opal/util/show_help.h" /* * Public string showing the coll ompi_libnbc component version number @@ -44,61 +47,6 @@ const char *mca_coll_libnbc_component_version_string = static int libnbc_priority = 10; static bool libnbc_in_progress = false; /* protect from recursive calls */ -bool libnbc_ibcast_skip_dt_decision = true; - -int libnbc_iallgather_algorithm = 0; /* iallgather user forced algorithm */ -static mca_base_var_enum_value_t iallgather_algorithms[] = { - {0, "ignore"}, - {1, "linear"}, - {2, "recursive_doubling"}, - {0, NULL} -}; - -int libnbc_iallreduce_algorithm = 0; /* iallreduce user forced algorithm */ -static mca_base_var_enum_value_t iallreduce_algorithms[] = { - {0, "ignore"}, - {1, "ring"}, - {2, "binomial"}, - {3, "rabenseifner"}, - {4, "recursive_doubling"}, - {0, NULL} -}; - -int libnbc_ibcast_algorithm = 0; /* ibcast user forced algorithm */ -int libnbc_ibcast_knomial_radix = 4; -static mca_base_var_enum_value_t ibcast_algorithms[] = { - {0, "ignore"}, - {1, "linear"}, - {2, "binomial"}, - {3, "chain"}, - {4, "knomial"}, - {0, NULL} -}; - -int libnbc_iexscan_algorithm = 0; /* iexscan user forced algorithm */ -static mca_base_var_enum_value_t iexscan_algorithms[] = { - {0, "ignore"}, - {1, "linear"}, - {2, "recursive_doubling"}, - {0, NULL} -}; - -int libnbc_ireduce_algorithm = 0; /* ireduce user forced algorithm */ -static mca_base_var_enum_value_t ireduce_algorithms[] = { - {0, "ignore"}, - {1, "chain"}, - {2, "binomial"}, - {3, "rabenseifner"}, - {0, NULL} -}; 
- -int libnbc_iscan_algorithm = 0; /* iscan user forced algorithm */ -static mca_base_var_enum_value_t iscan_algorithms[] = { - {0, "ignore"}, - {1, "linear"}, - {2, "recursive_doubling"}, - {0, NULL} -}; static int libnbc_open(void); static int libnbc_close(void); @@ -145,6 +93,38 @@ static int libnbc_open(void) { int ret; + if (mca_coll_libnbc_component.dynamic_rules_verbose > 0) { + mca_coll_libnbc_component.stream = opal_output_open(NULL); + opal_output_set_verbosity(mca_coll_libnbc_component.stream, mca_coll_libnbc_component.dynamic_rules_verbose); + } else { + mca_coll_libnbc_component.stream = -1; + } + if(mca_coll_libnbc_component.dynamic_rules_filename ) { + int rc; + opal_output_verbose(10, mca_coll_libnbc_component.stream , + "coll:libnbc:component_open Reading collective rules file [%s] which format is %d", + mca_coll_libnbc_component.dynamic_rules_filename, + mca_coll_libnbc_component.dynamic_rules_fileformat); + rc = ompi_coll_base_read_rules_config_file( mca_coll_libnbc_component.dynamic_rules_filename, + mca_coll_libnbc_component.dynamic_rules_fileformat, + &(mca_coll_libnbc_component.all_base_rules), COLLCOUNT); + if( rc >= 0 ) { + opal_output_verbose(10, mca_coll_libnbc_component.stream ,"coll:libnbc:module_open Read %d valid rules\n", rc); + if(ompi_coll_base_framework.framework_verbose >= 50) { + ompi_coll_base_dump_all_rules (mca_coll_libnbc_component.all_base_rules, COLLCOUNT); + } + } else { + opal_output_verbose(1, mca_coll_libnbc_component.stream ,"coll:libnbc:module_open Reading collective rules file failed\n"); + char error_name[12]; + sprintf(error_name,"file fail%1d", rc); + error_name[11] = '\0'; + opal_show_help("help-mpi-coll-libnbc.txt", (const char*)error_name, true, + mca_coll_libnbc_component.dynamic_rules_filename, mca_coll_libnbc_component.dynamic_rules_fileformat); + mca_coll_libnbc_component.all_base_rules = NULL; + } + } else { + mca_coll_libnbc_component.all_base_rules = NULL; + } 
OBJ_CONSTRUCT(&mca_coll_libnbc_component.requests, opal_free_list_t); OBJ_CONSTRUCT(&mca_coll_libnbc_component.active_requests, opal_list_t); @@ -173,6 +153,14 @@ libnbc_close(void) OBJ_DESTRUCT(&mca_coll_libnbc_component.active_requests); OBJ_DESTRUCT(&mca_coll_libnbc_component.lock); + if( NULL != mca_coll_libnbc_component.all_base_rules ) { + ompi_coll_base_free_all_rules(mca_coll_libnbc_component.all_base_rules, COLLCOUNT); + mca_coll_libnbc_component.all_base_rules = NULL; + } + /* close stream */ + if(mca_coll_libnbc_component.stream >= 0) { + opal_output_close(mca_coll_libnbc_component.stream); + } return OMPI_SUCCESS; } @@ -191,94 +179,42 @@ libnbc_register(void) MCA_BASE_VAR_SCOPE_READONLY, &libnbc_priority); - /* ibcast decision function can make the wrong decision if a legal - * non-uniform data type signature is used. This has resulted in the - * collective operation failing, and possibly producing wrong answers. - * We are investigating a fix for this problem, but it is taking a while. - * https://github.com/open-mpi/ompi/issues/2256 - * https://github.com/open-mpi/ompi/issues/1763 - * As a result we are adding an MCA parameter to make a conservative - * decision to avoid this issue. If the user knows that their application - * does not use data types in this way, then they can set this parameter - * to get the old behavior. Once the issue is truely fixed, then this - * parameter can be removed. - */ - libnbc_ibcast_skip_dt_decision = true; - (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "ibcast_skip_dt_decision", - "In ibcast only use size of communicator to choose algorithm, exclude data type signature. Set to 'false' to use data type signature in decision. 
WARNING: If you set this to 'false' then your application should not use non-uniform data type signatures in calls to ibcast.", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + mca_coll_libnbc_component.dynamic_rules_verbose = 0; + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, "dynamic_rules_verbose", + "Verbose level of the libnbc coll component regarding on dynamic rules." + " Examples: 0: no verbose, 1: selection errors, 10: selection output", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &libnbc_ibcast_skip_dt_decision); - - libnbc_iallgather_algorithm = 0; - (void) mca_base_var_enum_create("coll_libnbc_iallgather_algorithms", iallgather_algorithms, &new_enum); - mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "iallgather_algorithm", - "Which iallgather algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &libnbc_iallgather_algorithm); - OBJ_RELEASE(new_enum); - - libnbc_iallreduce_algorithm = 0; - (void) mca_base_var_enum_create("coll_libnbc_iallreduce_algorithms", iallreduce_algorithms, &new_enum); - mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "iallreduce_algorithm", - "Which iallreduce algorithm is used: 0 ignore, 1 ring, 2 binomial, 3 rabenseifner, 4 recursive_doubling", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &libnbc_iallreduce_algorithm); - OBJ_RELEASE(new_enum); - - libnbc_ibcast_algorithm = 0; - (void) mca_base_var_enum_create("coll_libnbc_ibcast_algorithms", ibcast_algorithms, &new_enum); - mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "ibcast_algorithm", - "Which ibcast algorithm is used: 0 ignore, 1 linear, 2 binomial, 3 chain, 4 knomial", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 
MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &libnbc_ibcast_algorithm); - OBJ_RELEASE(new_enum); - - libnbc_ibcast_knomial_radix = 4; + &mca_coll_libnbc_component.dynamic_rules_verbose); + + mca_coll_libnbc_component.dynamic_rules_filename = NULL; (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "ibcast_knomial_radix", "k-nomial tree radix for the ibcast algorithm (radix > 1)", + "dynamic_rules_filename", + "Filename of configuration file that contains the dynamic (@runtime) decision function rules", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_libnbc_component.dynamic_rules_filename); + + mca_coll_libnbc_component.dynamic_rules_fileformat = 0; + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "dynamic_rules_fileformat", + "Format of configuration file that contains the dynamic (@runtime) decision function rules. Accepted values are: 0 , 1 ", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, + OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, - &libnbc_ibcast_knomial_radix); - - libnbc_iexscan_algorithm = 0; - (void) mca_base_var_enum_create("coll_libnbc_iexscan_algorithms", iexscan_algorithms, &new_enum); - mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "iexscan_algorithm", - "Which iexscan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &libnbc_iexscan_algorithm); - OBJ_RELEASE(new_enum); - - libnbc_ireduce_algorithm = 0; - (void) mca_base_var_enum_create("coll_libnbc_ireduce_algorithms", ireduce_algorithms, &new_enum); - mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "ireduce_algorithm", - "Which ireduce algorithm is used: 0 ignore, 1 chain, 2 binomial, 3 rabenseifner", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, 
MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &libnbc_ireduce_algorithm); - OBJ_RELEASE(new_enum); - - libnbc_iscan_algorithm = 0; - (void) mca_base_var_enum_create("coll_libnbc_iscan_algorithms", iscan_algorithms, &new_enum); - mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, - "iscan_algorithm", - "Which iscan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", - MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &libnbc_iscan_algorithm); - OBJ_RELEASE(new_enum); + &mca_coll_libnbc_component.dynamic_rules_fileformat); + + ompi_coll_libnbc_allgather_check_forced_init (); + ompi_coll_libnbc_allreduce_check_forced_init (); + ompi_coll_libnbc_alltoall_check_forced_init (); + ompi_coll_libnbc_alltoallv_check_forced_init (); + ompi_coll_libnbc_alltoallw_check_forced_init (); + ompi_coll_libnbc_bcast_check_forced_init (); + ompi_coll_libnbc_exscan_check_forced_init (); + ompi_coll_libnbc_reduce_check_forced_init (); + ompi_coll_libnbc_scan_check_forced_init (); return OMPI_SUCCESS; } @@ -417,6 +353,27 @@ static int libnbc_module_enable(mca_coll_base_module_t *module, struct ompi_communicator_t *comm) { + ompi_coll_libnbc_module_t* nbc_module = (ompi_coll_libnbc_module_t*) module; + int i; + if(mca_coll_libnbc_component.all_base_rules) { + int size, nnodes; + /* Allocate the data that hangs off the communicator */ + if (OMPI_COMM_IS_INTER(comm)) { + size = ompi_comm_remote_size(comm); + } else { + size = ompi_comm_size(comm); + } + /* Get the number of nodes in communicator */ + nnodes = ompi_coll_base_get_nnodes(comm); + for(i=0;icom_rules[i] = ompi_coll_base_get_com_rule_ptr(mca_coll_libnbc_component.all_base_rules, + i, nnodes, size ); + } + } else { + for(i=0;icom_rules[i] = NULL; + } + } /* All done */ return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/libnbc/help-mpi-coll-libnbc.txt b/ompi/mca/coll/libnbc/help-mpi-coll-libnbc.txt new file 
mode 100644 index 00000000000..6e4e9163943 --- /dev/null +++ b/ompi/mca/coll/libnbc/help-mpi-coll-libnbc.txt @@ -0,0 +1,147 @@ +# -*- text -*- +# +# Copyright (c) 2020 Bull SAS. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open MPI's libnbc component +# (which uses the base config file parser). +# +[file fail-1] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The coll file parser was called with a null filename string. +This is an internal error. The parser should be used only if a rules filename was set by the user. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-2] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The coll file parser was called with a null rules pointer. +This is an internal error. The rules object pointer must be valid. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-3] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The coll file parser was called with an invalid max collectives number. +This is an internal error. The max collectives number must be greater than 0. + +This error causes the libnbc component to ignore the provided dynamic rules.
+Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-4] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The file can't be opened. Either the file is missing or its access rights are wrong. +Check if the file path defined by the mca parameter +OMPI_MCA_coll_libnbc_dynamic_rules_filename is valid. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-5] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The file format is unknown. It must be either + 0: Rules are defined according to a combination of + collective_id, communicator size and message size + or 1: Rules are defined according to a combination of + collective_id, communicator nodes number, communicator size and message size. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-6] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +Rules object allocation failed. This can be caused by memory exhaustion. + +This error causes the libnbc component to ignore the provided dynamic rules.
+Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-7] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The file parsing aborted. This can be induced by either: + a format mismatch + or a missing line + or an invalid configuration number. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-8] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +Inconsistent collective id or collective number. Collective number can't be greater than 22 and collective IDs belong to [0,21]. +As a reminder, the collective IDs are listed below: +IALLGATHER = 0, IALLGATHERV = 1, IALLREDUCE = 2, +IALLTOALL = 3, IALLTOALLV = 4, IALLTOALLW = 5, +IBARRIER = 6, IBCAST = 7, IEXSCAN = 8, IGATHER = 9, IGATHERV = 10, +IREDUCE = 11, IREDUCESCATTER = 12, IREDUCESCATTERBLOCK = 13, ISCAN = 14, +ISCATTER = 15, ISCATTERV = 16, INEIGHBOR_ALLGATHER = 17, INEIGHBOR_ALLGATHERV = 18, +INEIGHBOR_ALLTOALL = 19, INEIGHBOR_ALLTOALLV = 20, INEIGHBOR_ALLTOALLW = 21. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose.
+OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-9] + +The coll/libnbc module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +Inconsistent collective ID in rules object. This is an internal error. + +This error causes the libnbc component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) diff --git a/ompi/mca/coll/libnbc/nbc_iallgather.c b/ompi/mca/coll/libnbc/nbc_iallgather.c index 29ba7a6a9c1..e6f3dd867ef 100644 --- a/ompi/mca/coll/libnbc/nbc_iallgather.c +++ b/ompi/mca/coll/libnbc/nbc_iallgather.c @@ -11,6 +11,7 @@ * reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved.
* $COPYRIGHT$ * * Additional copyrights may follow @@ -29,6 +30,35 @@ static inline int allgather_sched_recursivedoubling( int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype); +int libnbc_iallgather_algorithm = 0; /* iallgather user forced algorithm */ +static mca_base_var_enum_value_t iallgather_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "recursive_doubling"}, + {0, NULL} +}; + +/* The following are used by dynamic and forced rules */ +/* this routine is called by the component only */ + +int ompi_coll_libnbc_allgather_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[ALLGATHER].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_iallgather_algorithms", iallgather_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "iallgather_algorithm", + "Which iallgather algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[ALLGATHER].algorithm); + + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Allgather_args_compare(NBC_Allgather_args *a, NBC_Allgather_args *b, void *param) { @@ -69,19 +99,35 @@ static int nbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype s p = ompi_comm_size (comm); int is_commsize_pow2 = !(p & (p - 1)); - if (libnbc_iallgather_algorithm == 0) { + if(!is_commsize_pow2) { + /* default */ alg = NBC_ALLGATHER_LINEAR; - } else { - /* user forced dynamic decision */ - if (libnbc_iallgather_algorithm == 1) { - alg = NBC_ALLGATHER_LINEAR; - } else if (libnbc_iallgather_algorithm == 2 && is_commsize_pow2) { - alg = NBC_ALLGATHER_RDBL; + } else if (libnbc_module->com_rules[ALLGATHER]) { + int 
algorithm,dummy1,dummy2,dummy3; + /* compute data size to choose correct rule */ + size_t dsize; + ompi_datatype_type_size (sendtype, &dsize); + dsize *= sendcount; + /* get algorithm */ + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[ALLGATHER], + dsize, &dummy1, &dummy2, &dummy3); + if (algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum */ } else { + /* default */ alg = NBC_ALLGATHER_LINEAR; } + } else if (0 != mca_coll_libnbc_component.forced_params[ALLGATHER].algorithm) { + alg = mca_coll_libnbc_component.forced_params[ALLGATHER].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + /* default */ + alg = NBC_ALLGATHER_LINEAR; } + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc iallgather : algorithm %d (no segmentation supported)", + alg + 1); + res = ompi_datatype_type_extent(recvtype, &rcvext); if (MPI_SUCCESS != res) { return res; diff --git a/ompi/mca/coll/libnbc/nbc_iallreduce.c b/ompi/mca/coll/libnbc/nbc_iallreduce.c index b8e9f27cbdf..5c91c0c4ac3 100644 --- a/ompi/mca/coll/libnbc/nbc_iallreduce.c +++ b/ompi/mca/coll/libnbc/nbc_iallreduce.c @@ -11,6 +11,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -42,6 +43,38 @@ static inline int allred_sched_redscat_allgather( const void *sbuf, void *rbuf, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf, struct ompi_communicator_t *comm); +static mca_base_var_enum_value_t iallreduce_algorithms[] = { + {0, "ignore"}, + {1, "ring"}, + {2, "binomial"}, + {3, "rabenseifner"}, + {4, "recursive_doubling"}, + {0, NULL} +}; + +typedef enum { NBC_ARED_BINOMIAL, NBC_ARED_RING, NBC_ARED_REDSCAT_ALLGATHER, NBC_ARED_RDBL } ared_algorithm_t; + +/* The following are used by dynamic and forced rules */ +/* this routine is called by the component only */ + +int ompi_coll_libnbc_allreduce_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[ALLREDUCE].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_iallreduce_algorithms", iallreduce_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "iallreduce_algorithm", + "Which iallreduce algorithm is used: 0 ignore, 1 ring, 2 binomial, 3 rabenseifner, 4 recursive_doubling", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[ALLREDUCE].algorithm); + + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Allreduce_args_compare(NBC_Allreduce_args *a, NBC_Allreduce_args *b, void *param) { @@ -61,6 +94,18 @@ int NBC_Allreduce_args_compare(NBC_Allreduce_args *a, NBC_Allreduce_args *b, voi } #endif +static ared_algorithm_t nbc_allreduce_default_algorithm(int p, size_t size, int count, + MPI_Op op, char inplace, int nprocs_pof2) +{ + if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) { + return NBC_ARED_BINOMIAL; + } else if (count >= nprocs_pof2 && ompi_op_is_commute(op)) { + return NBC_ARED_REDSCAT_ALLGATHER; + } 
else { + return NBC_ARED_RING; + } +} + static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module, bool persistent) @@ -72,7 +117,7 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI #ifdef NBC_CACHE_SCHEDULE NBC_Allreduce_args *args, *found, search; #endif - enum { NBC_ARED_BINOMIAL, NBC_ARED_RING, NBC_ARED_REDSCAT_ALLGATHER, NBC_ARED_RDBL } alg; + ared_algorithm_t alg; char inplace; void *tmpbuf = NULL; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; @@ -114,26 +159,32 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI /* algorithm selection */ int nprocs_pof2 = opal_next_poweroftwo(p) >> 1; - if (libnbc_iallreduce_algorithm == 0) { - if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) { - alg = NBC_ARED_BINOMIAL; - } else if (count >= nprocs_pof2 && ompi_op_is_commute(op)) { - alg = NBC_ARED_REDSCAT_ALLGATHER; + + if(libnbc_module->com_rules[ALLREDUCE]) { + int algorithm,dummy1,dummy2,dummy3; + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[ALLREDUCE], + size * count, &dummy1, &dummy2, &dummy3); + if(algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum */ } else { - alg = NBC_ARED_RING; + /* default */ + alg = nbc_allreduce_default_algorithm (p, size, count, op, inplace, nprocs_pof2); } + } else if(0 != mca_coll_libnbc_component.forced_params[ALLREDUCE].algorithm) { + /* if op is not commutative or MPI_IN_PLACE was specified we have to deal with it */ + alg = mca_coll_libnbc_component.forced_params[ALLREDUCE].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ } else { - if (libnbc_iallreduce_algorithm == 1) - alg = NBC_ARED_RING; - else if (libnbc_iallreduce_algorithm == 2) - alg = NBC_ARED_BINOMIAL; - else if 
(libnbc_iallreduce_algorithm == 3 && count >= nprocs_pof2 && ompi_op_is_commute(op)) - alg = NBC_ARED_REDSCAT_ALLGATHER; - else if (libnbc_iallreduce_algorithm == 4) - alg = NBC_ARED_RDBL; - else - alg = NBC_ARED_RING; + /* default */ + alg = nbc_allreduce_default_algorithm (p, size, count, op, inplace, nprocs_pof2); + } + + if (NBC_ARED_REDSCAT_ALLGATHER == alg && (count < nprocs_pof2 || !ompi_op_is_commute(op))) { + alg = NBC_ARED_RING; } + + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc iallreduce : algorithm %d (no segmentation supported)", + alg + 1); #ifdef NBC_CACHE_SCHEDULE /* search schedule in communicator specific tree */ search.sendbuf = sendbuf; diff --git a/ompi/mca/coll/libnbc/nbc_ialltoall.c b/ompi/mca/coll/libnbc/nbc_ialltoall.c index 6c5883f23c9..029ec9e11a7 100644 --- a/ompi/mca/coll/libnbc/nbc_ialltoall.c +++ b/ompi/mca/coll/libnbc/nbc_ialltoall.c @@ -12,6 +12,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -33,6 +34,34 @@ static inline int a2a_sched_diss(int rank, int p, MPI_Aint sndext, MPI_Aint rcve static inline int a2a_sched_inplace(int rank, int p, NBC_Schedule* schedule, void* buf, int count, MPI_Datatype type, MPI_Aint ext, ptrdiff_t gap, MPI_Comm comm); +static mca_base_var_enum_value_t ialltoall_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "pairwise"}, + {3, "binomial"}, + {0, NULL} +}; + +/* The following are used by dynamic and forced rules */ +/* this routine is called by the component only */ + +int ompi_coll_libnbc_alltoall_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[ALLTOALL].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_ialltoall_algorithms", ialltoall_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ialltoall_algorithm", + "Which ialltoall algorithm is used unless MPI_IN_PLACE flag has been specified: 0 ignore, 1 linear, 2 pairwise, 3 binomial", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[ALLTOALL].algorithm); + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Alltoall_args_compare(NBC_Alltoall_args *a, NBC_Alltoall_args *b, void *param) { @@ -60,7 +89,7 @@ static int nbc_alltoall_init(const void* sendbuf, int sendcount, MPI_Datatype se { int rank, p, res; MPI_Aint datasize; - size_t a2asize, sndsize; + size_t sndsize; NBC_Schedule *schedule; MPI_Aint rcvext, sndext; #ifdef NBC_CACHE_SCHEDULE @@ -95,22 +124,26 @@ static int nbc_alltoall_init(const void* sendbuf, int sendcount, MPI_Datatype se return res; } - /* algorithm selection */ - a2asize = sndsize * sendcount * p; - /* this number is optimized for TCP on odin.cs.indiana.edu */ - if (inplace) { + 
if(inplace) { alg = NBC_A2A_INPLACE; - } else if((p <= 8) && ((a2asize < 1<<17) || (sndsize*sendcount < 1<<12))) { - /* just send as fast as we can if we have less than 8 peers, if the - * total communicated size is smaller than 1<<17 *and* if we don't - * have eager messages (msgsize < 1<<13) */ - alg = NBC_A2A_LINEAR; - } else if(a2asize < (1<<12)*(unsigned int)p) { - /*alg = NBC_A2A_DISS;*/ - alg = NBC_A2A_LINEAR; - } else - alg = NBC_A2A_LINEAR; /*NBC_A2A_PAIRWISE;*/ + } else if(libnbc_module->com_rules[ALLTOALL]) { + int algorithm,dummy1,dummy2,dummy3; + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[ALLTOALL], + sndsize * sendcount, &dummy1, &dummy2, &dummy3); + if(algorithm) { + alg = algorithm - 1; + } else { + alg = NBC_A2A_LINEAR; /* default if not inplace */ + } + } else if(0 != mca_coll_libnbc_component.forced_params[ALLTOALL].algorithm) { + alg = mca_coll_libnbc_component.forced_params[ALLTOALL].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_A2A_LINEAR; /* default if not inplace */ + } + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc ialltoall : algorithm %d (no segmentation supported)", + alg + 1); /* allocate temp buffer if we need one */ if (alg == NBC_A2A_INPLACE) { span = opal_datatype_span(&recvtype->super, recvcount, &gap); @@ -197,7 +230,7 @@ static int nbc_alltoall_init(const void* sendbuf, int sendcount, MPI_Datatype se free(tmpbuf); return OMPI_ERR_OUT_OF_RESOURCE; } - + // cppcheck-suppress knownConditionTrueFalse if (!inplace) { /* copy my data to receive buffer */ rbuf = (char *) recvbuf + (MPI_Aint)rank * (MPI_Aint)recvcount * rcvext; diff --git a/ompi/mca/coll/libnbc/nbc_ialltoallv.c b/ompi/mca/coll/libnbc/nbc_ialltoallv.c index 40f04068216..d99e0984efb 100644 --- a/ompi/mca/coll/libnbc/nbc_ialltoallv.c +++ b/ompi/mca/coll/libnbc/nbc_ialltoallv.c @@ -11,6 +11,7 @@ * reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -36,6 +37,34 @@ static inline int a2av_sched_inplace(int rank, int p, NBC_Schedule *schedule, void *buf, const int *counts, const int *displs, MPI_Aint ext, MPI_Datatype type, ptrdiff_t gap); +static mca_base_var_enum_value_t ialltoallv_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "pairwise"}, + {0, NULL} +}; + +/* The following are used by dynamic and forced rules */ +/* this routine is called by the component only */ + +int ompi_coll_libnbc_alltoallv_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[ALLTOALLV].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_ialltoallv_algorithms", ialltoallv_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ialltoallv_algorithm", + "Which ialltoallv algorithm is used unless MPI_IN_PLACE flag has been specified: 0 ignore, 1 linear, 2 pairwise", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[ALLTOALLV].algorithm); + + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + /* an alltoallv schedule can not be cached easily because the contents * ot the recvcounts array may change, so a comparison of the address * would not be sufficient ... we simply do not cache it */ @@ -65,6 +94,34 @@ static int nbc_alltoallv_init(const void* sendbuf, const int *sendcounts, const return res; } + enum {NBC_A2AV_LINEAR, NBC_A2AV_PAIRWISE, NBC_A2AV_INPLACE} alg; + + if (inplace) { + alg = NBC_A2AV_INPLACE; + } else if (libnbc_module->com_rules[ALLTOALLV]) { + int algorithm,dummy1,dummy2,dummy3; + /** + * check to see if we have some filebased rules. 
As we don't have global + * knowledge about the total amount of data, use the first available rule. + * This allow the users to specify the alltoallv algorithm to be used only + * based on the communicator size. + */ + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[ALLTOALLV], + 0, &dummy1, &dummy2, &dummy3); + if (algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_A2AV_LINEAR; /* default if not inplace */ + } + } else if (0 != mca_coll_libnbc_component.forced_params[ALLTOALLV].algorithm) { + alg = mca_coll_libnbc_component.forced_params[ALLTOALLV].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_A2AV_LINEAR; /* default if not inplace */ + } + + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc ialltoallv : algorithm %d (no segmentation supported)", + alg + 1); /* copy data to receivbuffer */ if (inplace) { int count = 0; @@ -116,14 +173,23 @@ static int nbc_alltoallv_init(const void* sendbuf, const int *sendcounts, const } } - if (inplace) { - res = a2av_sched_inplace(rank, p, schedule, recvbuf, recvcounts, - rdispls, rcvext, recvtype, gap); - } else { - res = a2av_sched_linear(rank, p, schedule, - sendbuf, sendcounts, sdispls, sndext, sendtype, - recvbuf, recvcounts, rdispls, rcvext, recvtype); + switch (alg) { + case NBC_A2AV_INPLACE: + res = a2av_sched_inplace(rank, p, schedule, recvbuf, recvcounts, + rdispls, rcvext, recvtype, gap); + break; + case NBC_A2AV_LINEAR: + res = a2av_sched_linear(rank, p, schedule, + sendbuf, sendcounts, sdispls, sndext, sendtype, + recvbuf, recvcounts, rdispls, rcvext, recvtype); + break; + case NBC_A2AV_PAIRWISE: + res = a2av_sched_pairwise(rank, p, schedule, + sendbuf, sendcounts, sdispls, sndext, sendtype, + recvbuf, recvcounts, rdispls, rcvext, recvtype); + break; } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); free(tmpbuf); diff --git 
a/ompi/mca/coll/libnbc/nbc_ialltoallw.c b/ompi/mca/coll/libnbc/nbc_ialltoallw.c index 3992bb40073..e8d003ddf17 100644 --- a/ompi/mca/coll/libnbc/nbc_ialltoallw.c +++ b/ompi/mca/coll/libnbc/nbc_ialltoallw.c @@ -11,6 +11,7 @@ * reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -36,6 +37,34 @@ static inline int a2aw_sched_inplace(int rank, int p, NBC_Schedule *schedule, void *buf, const int *counts, const int *displs, struct ompi_datatype_t * const * types); +static mca_base_var_enum_value_t ialltoallw_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "pairwise"}, + {0, NULL} +}; + +/* The following are used by dynamic and forced rules */ +/* this routine is called by the component only */ + +int ompi_coll_libnbc_alltoallw_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[ALLTOALLW].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_ialltoallw_algorithms", ialltoallw_algorithms, &new_enum); /* enum name follows the coll_libnbc_i<coll>_algorithms pattern of the sibling collectives */ + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ialltoallw_algorithm", + "Which ialltoallw algorithm is used unless MPI_IN_PLACE flag has been specified: 0 ignore, 1 linear, 2 pairwise", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[ALLTOALLW].algorithm); + + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + /* an alltoallw schedule can not be cached easily because the contents * ot the recvcounts array may change, so a comparison of the address * would not be sufficient ... 
we simply do not cache it */ @@ -58,6 +87,33 @@ static int nbc_alltoallw_init(const void* sendbuf, const int *sendcounts, const rank = ompi_comm_rank (comm); p = ompi_comm_size (comm); + enum {NBC_A2AW_LINEAR, NBC_A2AW_PAIRWISE, NBC_A2AW_INPLACE} alg; + if (inplace) { + alg = NBC_A2AW_INPLACE; + } else if (libnbc_module->com_rules[ALLTOALLW]) { + int algorithm,dummy1,dummy2,dummy3; + /** + * check to see if we have some filebased rules. As we don't have global + * knowledge about the total amount of data, use the first available rule. + * This allow the users to specify the alltoallw algorithm to be used only + * based on the communicator size. + */ + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[ALLTOALLW], + 0, &dummy1, &dummy2, &dummy3); + if (algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_A2AW_LINEAR; /* default if not inplace */ + } + } else if (0 != mca_coll_libnbc_component.forced_params[ALLTOALLW].algorithm) { + alg = mca_coll_libnbc_component.forced_params[ALLTOALLW].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_A2AW_LINEAR; /* default if not inplace */ + } + + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc ialltoallw : algorithm %d (no segmentation supported)", + alg + 1); /* copy data to receivbuffer */ if (inplace) { ptrdiff_t lgap, lspan; @@ -101,14 +157,23 @@ static int nbc_alltoallw_init(const void* sendbuf, const int *sendcounts, const } } - if (inplace) { - res = a2aw_sched_inplace(rank, p, schedule, recvbuf, - recvcounts, rdispls, recvtypes); - } else { - res = a2aw_sched_linear(rank, p, schedule, - sendbuf, sendcounts, sdispls, sendtypes, - recvbuf, recvcounts, rdispls, recvtypes); + switch(alg) { + case NBC_A2AW_INPLACE: + res = a2aw_sched_inplace(rank, p, schedule, recvbuf, + recvcounts, rdispls, recvtypes); + break; + case NBC_A2AW_LINEAR: + res = a2aw_sched_linear(rank, p, schedule, + sendbuf, 
sendcounts, sdispls, sendtypes, + recvbuf, recvcounts, rdispls, recvtypes); + break; + case NBC_A2AW_PAIRWISE: + res = a2aw_sched_pairwise(rank, p, schedule, + sendbuf, sendcounts, sdispls, sendtypes, + recvbuf, recvcounts, rdispls, recvtypes); + break; } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); free(tmpbuf); diff --git a/ompi/mca/coll/libnbc/nbc_ibcast.c b/ompi/mca/coll/libnbc/nbc_ibcast.c index cbd381328d0..ea627c30e18 100644 --- a/ompi/mca/coll/libnbc/nbc_ibcast.c +++ b/ompi/mca/coll/libnbc/nbc_ibcast.c @@ -11,6 +11,7 @@ * reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -20,6 +21,9 @@ */ #include "nbc_internal.h" +#define IBCAST_DEFAULT_RADIX 4 +#define IBCAST_DEFAULT_SEGSIZE 16384 + static inline int bcast_sched_binomial(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype); static inline int bcast_sched_linear(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, @@ -29,6 +33,82 @@ static inline int bcast_sched_chain(int rank, int p, int root, NBC_Schedule *sch static inline int bcast_sched_knomial(int rank, int comm_size, int root, NBC_Schedule *schedule, void *buf, int count, MPI_Datatype datatype, int knomial_radix); +static int libnbc_ibcast_knomial_radix; +static bool libnbc_ibcast_skip_dt_decision; + +static mca_base_var_enum_value_t ibcast_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "binomial"}, + {3, "chain"}, + {4, "knomial"}, + {0, NULL} +}; + +typedef enum { NBC_BCAST_LINEAR, NBC_BCAST_BINOMIAL, NBC_BCAST_CHAIN, NBC_BCAST_KNOMIAL } bcast_algorithm_t; + +/* The following are used by dynamic and forced rules */ + +/* this routine is called by the component only */ +/* module does not call this it calls the forced_getvalues routine instead */ + +int 
ompi_coll_libnbc_bcast_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[BCAST].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_ibcast_algorithms", ibcast_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ibcast_algorithm", + "Which ibcast algorithm is used: 0 ignore, 1 linear, 2 binomial, 3 chain, 4 knomial", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[BCAST].algorithm); + + mca_coll_libnbc_component.forced_params[BCAST].segsize = IBCAST_DEFAULT_SEGSIZE; + mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ibcast_algorithm_segmentsize", + "Segment size in bytes used by default for ibcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[BCAST].segsize); + + libnbc_ibcast_knomial_radix = IBCAST_DEFAULT_RADIX; + + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ibcast_knomial_radix", "k-nomial tree radix for the ibcast algorithm (radix > 1)", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &libnbc_ibcast_knomial_radix); + /* ibcast decision function can make the wrong decision if a legal + * non-uniform data type signature is used. This has resulted in the + * collective operation failing, and possibly producing wrong answers. + * We are investigating a fix for this problem, but it is taking a while. + * https://github.com/open-mpi/ompi/issues/2256 + * https://github.com/open-mpi/ompi/issues/1763 + * As a result we are adding an MCA parameter to make a conservative + * decision to avoid this issue. 
If the user knows that their application + * does not use data types in this way, then they can set this parameter + * to get the old behavior. Once the issue is truely fixed, then this + * parameter can be removed. + */ + libnbc_ibcast_skip_dt_decision = true; + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ibcast_skip_dt_decision", + "In ibcast only use size of communicator to choose algorithm, exclude data type signature. Set to 'false' to use data type signature in decision. WARNING: If you set this to 'false' then your application should not use non-uniform data type signatures in calls to ibcast.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &libnbc_ibcast_skip_dt_decision); + + + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Bcast_args_compare(NBC_Bcast_args *a, NBC_Bcast_args *b, void *param) { @@ -47,17 +127,47 @@ int NBC_Bcast_args_compare(NBC_Bcast_args *a, NBC_Bcast_args *b, void *param) { } #endif +static bcast_algorithm_t nbc_bcast_default_algorithm(int p, size_t size, int count, + int* segsize) +{ + bcast_algorithm_t alg; + *segsize = IBCAST_DEFAULT_SEGSIZE; + + if( libnbc_ibcast_skip_dt_decision ) { + if (p <= 4) { + alg = NBC_BCAST_LINEAR; + } + else { + alg = NBC_BCAST_BINOMIAL; + } + } + else { + if (p <= 4) { + alg = NBC_BCAST_LINEAR; + } else if (size * count < 65536) { + alg = NBC_BCAST_BINOMIAL; + } else if (size * count < 524288) { + alg = NBC_BCAST_CHAIN; + *segsize = 8192; + } else { + alg = NBC_BCAST_CHAIN; + *segsize = 32768; + } + } + return alg; +} + static int nbc_bcast_init(void *buffer, int count, MPI_Datatype datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module, bool persistent) { - int rank, p, res, segsize; + int rank, p, res, segsize, radix; size_t size; NBC_Schedule *schedule; 
#ifdef NBC_CACHE_SCHEDULE NBC_Bcast_args *args, *found, search; #endif - enum { NBC_BCAST_LINEAR, NBC_BCAST_BINOMIAL, NBC_BCAST_CHAIN, NBC_BCAST_KNOMIAL } alg; + bcast_algorithm_t alg; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; rank = ompi_comm_rank (comm); @@ -73,43 +183,35 @@ static int nbc_bcast_init(void *buffer, int count, MPI_Datatype datatype, int ro return res; } - segsize = 16384; - /* algorithm selection */ - if (libnbc_ibcast_algorithm == 0) { - if( libnbc_ibcast_skip_dt_decision ) { - if (p <= 4) { - alg = NBC_BCAST_LINEAR; - } - else { - alg = NBC_BCAST_BINOMIAL; - } - } - else { - if (p <= 4) { - alg = NBC_BCAST_LINEAR; - } else if (size * count < 65536) { - alg = NBC_BCAST_BINOMIAL; - } else if (size * count < 524288) { - alg = NBC_BCAST_CHAIN; - segsize = 8192; - } else { - alg = NBC_BCAST_CHAIN; - segsize = 32768; - } - } - } else { - /* user forced dynamic decision */ - if (libnbc_ibcast_algorithm == 1) { - alg = NBC_BCAST_LINEAR; - } else if (libnbc_ibcast_algorithm == 2) { - alg = NBC_BCAST_BINOMIAL; - } else if (libnbc_ibcast_algorithm == 3) { - alg = NBC_BCAST_CHAIN; - } else if (libnbc_ibcast_algorithm == 4 && libnbc_ibcast_knomial_radix > 1) { - alg = NBC_BCAST_KNOMIAL; + if (libnbc_module->com_rules[BCAST]) { + int algorithm, dummy; + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[BCAST], + size * count, &radix, &segsize, &dummy); + if (algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum */ } else { - alg = NBC_BCAST_LINEAR; + /* get default algorithm ID but keep our segsize value */ + alg = nbc_bcast_default_algorithm(p, size, count, &dummy); } + } else if (0 != mca_coll_libnbc_component.forced_params[BCAST].algorithm) { + alg = mca_coll_libnbc_component.forced_params[BCAST].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + segsize = mca_coll_libnbc_component.forced_params[BCAST].segsize; + radix = libnbc_ibcast_knomial_radix; + } 
else { + alg = nbc_bcast_default_algorithm(p, size, count, &segsize); + radix = libnbc_ibcast_knomial_radix; + } + + if (NBC_BCAST_KNOMIAL == alg && radix <= 1) { + alg = NBC_BCAST_LINEAR; + } + + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc ibcast : algorithm %d segmentsize %d radix %d", + alg + 1, segsize, radix); + + if(0 == segsize) { + segsize = count * size; /* only one frag */ } #ifdef NBC_CACHE_SCHEDULE @@ -137,7 +239,7 @@ static int nbc_bcast_init(void *buffer, int count, MPI_Datatype datatype, int ro res = bcast_sched_chain(rank, p, root, schedule, buffer, count, datatype, segsize, size); break; case NBC_BCAST_KNOMIAL: - res = bcast_sched_knomial(rank, p, root, schedule, buffer, count, datatype, libnbc_ibcast_knomial_radix); + res = bcast_sched_knomial(rank, p, root, schedule, buffer, count, datatype, radix); break; } diff --git a/ompi/mca/coll/libnbc/nbc_iexscan.c b/ompi/mca/coll/libnbc/nbc_iexscan.c index 547da001dc1..97dec640e1b 100644 --- a/ompi/mca/coll/libnbc/nbc_iexscan.c +++ b/ompi/mca/coll/libnbc/nbc_iexscan.c @@ -11,6 +11,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -32,6 +33,37 @@ static inline int exscan_sched_recursivedoubling( int count, MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2); +static mca_base_var_enum_value_t iexscan_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "recursive_doubling"}, + {0, NULL} +}; + + +/* The following are used by dynamic and forced rules */ + +/* this routine is called by the component only */ +/* module does not call this it calls the forced_getvalues routine instead */ + +int ompi_coll_libnbc_exscan_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[EXSCAN].algorithm = 0; + + (void) mca_base_var_enum_create("coll_libnbc_iexscan_algorithms", iexscan_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "iexscan_algorithm", + "Which iexscan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[EXSCAN].algorithm); + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { @@ -71,16 +103,34 @@ static int nbc_exscan_init(const void* sendbuf, void* recvbuf, int count, MPI_Da return nbc_get_noop_request(persistent, request); } + if (libnbc_module->com_rules[EXSCAN]) { + /* compute data size to choose correct rule */ + size_t dsize; + ompi_datatype_type_size (datatype, &dsize); + dsize *= count; + /* get algorithm */ + int algorithm, dummy1, dummy2, dummy3; + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[EXSCAN], + dsize, &dummy1, &dummy2, &dummy3); + if (algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum 
*/ + } else { + alg = NBC_EXSCAN_LINEAR; /* default */ + } + } else if (0 != mca_coll_libnbc_component.forced_params[EXSCAN].algorithm) { + alg = mca_coll_libnbc_component.forced_params[EXSCAN].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_EXSCAN_LINEAR; /* default */ + } + span = opal_datatype_span(&datatype->super, count, &gap); - if (libnbc_iexscan_algorithm == 2) { - alg = NBC_EXSCAN_RDBL; + if (NBC_EXSCAN_RDBL == alg) { ptrdiff_t span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t); tmpbuf = malloc(span_align + span); if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } tmpbuf1 = (void *)(-gap); tmpbuf2 = (char *)(span_align) - gap; } else { - alg = NBC_EXSCAN_LINEAR; if (rank > 0) { tmpbuf = malloc(span); if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } diff --git a/ompi/mca/coll/libnbc/nbc_ireduce.c b/ompi/mca/coll/libnbc/nbc_ireduce.c index da50f1eb276..22d3d5f5584 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce.c @@ -11,6 +11,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -26,6 +27,8 @@ #include "nbc_internal.h" +#define IREDUCE_DEFAULT_SEGSIZE 8192 + static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, char tmpredbuf, int count, MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf); static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, @@ -38,6 +41,47 @@ static inline int red_sched_redscat_gather( char tmpredbuf, int count, MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmp_buf, struct ompi_communicator_t *comm); +typedef enum { NBC_RED_BINOMIAL, NBC_RED_CHAIN, NBC_RED_REDSCAT_GATHER} reduce_algorithm_t; + +static mca_base_var_enum_value_t ireduce_algorithms[] = { + {0, "ignore"}, + {1, "chain"}, + {2, "binomial"}, + {3, "rabenseifner"}, + {0, NULL} +}; + +/* The following are used by dynamic and forced rules */ + +/* this routine is called by the component only */ +/* module does not call this it calls the forced_getvalues routine instead */ + +int ompi_coll_libnbc_reduce_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + mca_coll_libnbc_component.forced_params[REDUCE].algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_ireduce_algorithms", ireduce_algorithms, &new_enum); + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ireduce_algorithm", + "Which ireduce algorithm is used: 0 ignore, 1 chain, 2 binomial, 3 rabenseifner", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[REDUCE].algorithm); + + mca_coll_libnbc_component.forced_params[REDUCE].segsize = IREDUCE_DEFAULT_SEGSIZE; + mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ireduce_algorithm_segmentsize", + "Segment size in bytes used by 
default for ireduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[REDUCE].segsize); + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Reduce_args_compare(NBC_Reduce_args *a, NBC_Reduce_args *b, void *param) { @@ -58,6 +102,18 @@ int NBC_Reduce_args_compare(NBC_Reduce_args *a, NBC_Reduce_args *b, void *param) } #endif +static reduce_algorithm_t nbc_reduce_default_algorithm(int p, size_t size, int count, + MPI_Op op, int nprocs_pof2) +{ + if (ompi_op_is_commute(op) && p > 2 && count >= nprocs_pof2) { + return NBC_RED_REDSCAT_GATHER; + } else if (p > 4 || size * count < 65536 || !ompi_op_is_commute(op)) { + return NBC_RED_BINOMIAL; + } else { + return NBC_RED_CHAIN; + } +} + /* the non-blocking reduce */ static int nbc_reduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, @@ -102,27 +158,30 @@ static int nbc_reduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Da } span = opal_datatype_span(&datatype->super, count, &gap); + int nprocs_pof2 = opal_next_poweroftwo(p) >> 1; /* algorithm selection */ - int nprocs_pof2 = opal_next_poweroftwo(p) >> 1; - if (libnbc_ireduce_algorithm == 0) { - if (ompi_op_is_commute(op) && p > 2 && count >= nprocs_pof2) { - alg = NBC_RED_REDSCAT_GATHER; - } else if (p > 4 || size * count < 65536 || !ompi_op_is_commute(op)) { - alg = NBC_RED_BINOMIAL; + if (libnbc_module->com_rules[REDUCE]) { + int algorithm,dummy1,dummy2; + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[REDUCE], + size * count, &dummy1, &segsize, &dummy2); + if (algorithm) { + alg = (1 == algorithm) ? NBC_RED_CHAIN : (2 == algorithm) ? NBC_RED_BINOMIAL : NBC_RED_REDSCAT_GATHER; /* map algorithm ID to enum by name: the ID table is 1 chain, 2 binomial, 3 rabenseifner, 
which is not the enum declaration order, so "ID - 1" would swap chain and binomial */ } else { - alg = NBC_RED_CHAIN; + alg = nbc_reduce_default_algorithm(p, size, count, op, nprocs_pof2); } + } else if (0 != mca_coll_libnbc_component.forced_params[REDUCE].algorithm) { + alg = (1 == mca_coll_libnbc_component.forced_params[REDUCE].algorithm) ? NBC_RED_CHAIN : (2 == mca_coll_libnbc_component.forced_params[REDUCE].algorithm) ? NBC_RED_BINOMIAL : NBC_RED_REDSCAT_GATHER; /* map algorithm ID to enum by name (1 chain, 2 binomial, 3 rabenseifner) */ + segsize = mca_coll_libnbc_component.forced_params[REDUCE].segsize; } else { - if (libnbc_ireduce_algorithm == 1) { - alg = NBC_RED_CHAIN; - } else if (libnbc_ireduce_algorithm == 2) { - alg = NBC_RED_BINOMIAL; - } else if (libnbc_ireduce_algorithm == 3 && ompi_op_is_commute(op) && p > 2 && count >= nprocs_pof2) { - alg = NBC_RED_REDSCAT_GATHER; - } else { - alg = NBC_RED_CHAIN; - } + /* default */ + alg = nbc_reduce_default_algorithm(p, size, count, op, nprocs_pof2); + segsize = IREDUCE_DEFAULT_SEGSIZE; + } + + if (NBC_RED_REDSCAT_GATHER == alg && (!ompi_op_is_commute(op) || p <= 2 || count < nprocs_pof2)) { + alg = NBC_RED_CHAIN; + segsize = IREDUCE_DEFAULT_SEGSIZE; } /* allocate temporary buffers */ @@ -140,9 +199,14 @@ static int nbc_reduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Da } } else { tmpbuf = malloc (span); - segsize = 16384/2; } + opal_output_verbose(10, mca_coll_libnbc_component.stream, + "Libnbc ireduce : algorithm %d segmentsize %d", + alg + 1, segsize); + if(0 == segsize) { + segsize = count * size; /* only one frag */ + } if (OPAL_UNLIKELY(NULL == tmpbuf)) { return OMPI_ERR_OUT_OF_RESOURCE; } diff --git a/ompi/mca/coll/libnbc/nbc_iscan.c b/ompi/mca/coll/libnbc/nbc_iscan.c index ccc531d669e..317a1c537cf 100644 --- a/ompi/mca/coll/libnbc/nbc_iscan.c +++ b/ompi/mca/coll/libnbc/nbc_iscan.c @@ -11,6 +11,7 @@ * reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -32,6 +33,36 @@ static inline int scan_sched_recursivedoubling( int count, MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2); +static mca_base_var_enum_value_t iscan_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "recursive_doubling"}, + {0, NULL} +}; + +/* The following are used by dynamic and forced rules */ + +/* this routine is called by the component only */ +/* module does not call this it calls the forced_getvalues routine instead */ + +int ompi_coll_libnbc_scan_check_forced_init (void) +{ + mca_base_var_enum_t *new_enum; + + (void) mca_base_var_enum_create("coll_libnbc_iscan_algorithms", iscan_algorithms, &new_enum); + + mca_coll_libnbc_component.forced_params[SCAN].algorithm = 0; + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "iscan_algorithm", + "Which scan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &mca_coll_libnbc_component.forced_params[SCAN].algorithm); + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; +} + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { @@ -71,16 +102,34 @@ static int nbc_scan_init(const void* sendbuf, void* recvbuf, int count, MPI_Data return nbc_get_noop_request(persistent, request); } + if (libnbc_module->com_rules[SCAN]) { + /* compute data size to choose correct rule */ + size_t dsize; + ompi_datatype_type_size (datatype, &dsize); + dsize *= count; + /* get algorithm */ + int algorithm, dummy1, dummy2, dummy3; + algorithm = ompi_coll_base_get_target_method_params (libnbc_module->com_rules[SCAN], + dsize, &dummy1, &dummy2, &dummy3); + if (algorithm) { + alg = algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = 
NBC_SCAN_LINEAR; /* default */ + } + } else if (0 != mca_coll_libnbc_component.forced_params[SCAN].algorithm) { + alg = mca_coll_libnbc_component.forced_params[SCAN].algorithm - 1; /* -1 is to shift from algorithm ID to enum */ + } else { + alg = NBC_SCAN_LINEAR; /* default */ + } + span = opal_datatype_span(&datatype->super, count, &gap); - if (libnbc_iscan_algorithm == 2) { - alg = NBC_SCAN_RDBL; + if (NBC_SCAN_RDBL == alg) { ptrdiff_t span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t); tmpbuf = malloc(span_align + span); if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } tmpbuf1 = (void *)(-gap); tmpbuf2 = (char *)(span_align) - gap; } else { - alg = NBC_SCAN_LINEAR; if (rank > 0) { tmpbuf = malloc(span); if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } diff --git a/ompi/mca/coll/tuned/Makefile.am b/ompi/mca/coll/tuned/Makefile.am index 82be7bb72aa..52f3b977f5f 100644 --- a/ompi/mca/coll/tuned/Makefile.am +++ b/ompi/mca/coll/tuned/Makefile.am @@ -20,14 +20,12 @@ # $HEADER$ # +dist_ompidata_DATA = help-mpi-coll-tuned.txt + sources = \ coll_tuned.h \ - coll_tuned_dynamic_file.h \ - coll_tuned_dynamic_rules.h \ coll_tuned_decision_fixed.c \ coll_tuned_decision_dynamic.c \ - coll_tuned_dynamic_file.c \ - coll_tuned_dynamic_rules.c \ coll_tuned_component.c \ coll_tuned_module.c \ coll_tuned_allgather_decision.c \ diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index e4d66cc6004..7a92f281166 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -6,6 +6,7 @@ * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2019 Mellanox Technologies. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -25,7 +26,7 @@ #include "opal/util/output.h" /* also need the dynamic rule structures */ -#include "coll_tuned_dynamic_rules.h" +#include "ompi/mca/coll/base/coll_base_dynamic_rules.h" BEGIN_C_DECLS @@ -34,6 +35,7 @@ extern int ompi_coll_tuned_stream; extern int ompi_coll_tuned_priority; extern bool ompi_coll_tuned_use_dynamic_rules; extern char* ompi_coll_tuned_dynamic_rules_filename; +extern int ompi_coll_tuned_dynamic_rules_fileformat; extern int ompi_coll_tuned_init_tree_fanout; extern int ompi_coll_tuned_init_chain_fanout; extern int ompi_coll_tuned_init_max_requests; @@ -196,7 +198,7 @@ struct mca_coll_tuned_component_t { /* MCA parameters first */ /* cached decision table stuff (moved from MCW module) */ - ompi_coll_alg_rule_t *all_base_rules; + ompi_coll_base_alg_rule_t *all_base_rules; }; /** * Convenience typedef @@ -216,7 +218,7 @@ struct mca_coll_tuned_module_t { coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT]; /* the communicator rules for each MPI collective for ONLY my comsize */ - ompi_coll_com_rule_t *com_rules[COLLCOUNT]; + ompi_coll_base_com_rule_t *com_rules[COLLCOUNT]; }; typedef struct mca_coll_tuned_module_t mca_coll_tuned_module_t; OBJ_CLASS_DECLARATION(mca_coll_tuned_module_t); diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index 7f6764d5f98..91974547f5e 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -17,6 +17,7 @@ * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2019 Mellanox Technologies. All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -32,12 +33,13 @@ #include "ompi_config.h" #include "opal/util/output.h" +#include "opal/util/show_help.h" #include "coll_tuned.h" #include "mpi.h" #include "ompi/mca/coll/coll.h" #include "coll_tuned.h" -#include "coll_tuned_dynamic_file.h" +#include "ompi/mca/coll/base/coll_base_dynamic_file.h" /* * Public string showing the coll ompi_tuned component version number @@ -49,9 +51,11 @@ const char *ompi_coll_tuned_component_version_string = * Global variable */ int ompi_coll_tuned_stream = -1; +int ompi_coll_tuned_verbose = 0; int ompi_coll_tuned_priority = 30; bool ompi_coll_tuned_use_dynamic_rules = false; char* ompi_coll_tuned_dynamic_rules_filename = (char*) NULL; +int ompi_coll_tuned_dynamic_rules_fileformat = 0; int ompi_coll_tuned_init_tree_fanout = 4; int ompi_coll_tuned_init_chain_fanout = 4; int ompi_coll_tuned_init_max_requests = 128; @@ -191,6 +195,23 @@ static int tuned_register(void) MCA_BASE_VAR_SCOPE_READONLY, &ompi_coll_tuned_dynamic_rules_filename); + ompi_coll_tuned_dynamic_rules_fileformat = 0; + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "dynamic_rules_fileformat", + "Format of configuration file that contains the dynamic (@runtime) decision function rules. Accepted values are: 0 , 1 ", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_coll_tuned_dynamic_rules_fileformat); + ompi_coll_tuned_verbose = 0; + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "verbose", + "Verbose level of the tuned coll component." 
+ " Examples: 0: no verbose, 1: errors, 10: basic, 50: dynamic rules, 100: maximum verbosity level.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_coll_tuned_verbose); + /* register forced params */ ompi_coll_tuned_allreduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLREDUCE]); ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALL]); @@ -212,13 +233,10 @@ static int tuned_register(void) static int tuned_open(void) { - int rc; - -#if OPAL_ENABLE_DEBUG - if (ompi_coll_base_framework.framework_verbose) { + if (ompi_coll_tuned_verbose > 0) { ompi_coll_tuned_stream = opal_output_open(NULL); + opal_output_set_verbosity(ompi_coll_tuned_stream, ompi_coll_tuned_verbose); } -#endif /* OPAL_ENABLE_DEBUG */ /* now check that the user hasn't overrode any of the decision functions if dynamic rules are enabled */ /* the user can redo this before every comm dup/create if they like */ @@ -231,20 +249,32 @@ static int tuned_open(void) /* by default DISABLE dynamic rules and instead use fixed [if based] rules */ if (ompi_coll_tuned_use_dynamic_rules) { if( ompi_coll_tuned_dynamic_rules_filename ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:component_open Reading collective rules file [%s]", - ompi_coll_tuned_dynamic_rules_filename)); - rc = ompi_coll_tuned_read_rules_config_file( ompi_coll_tuned_dynamic_rules_filename, - &(mca_coll_tuned_component.all_base_rules), COLLCOUNT); + int rc; + opal_output_verbose(10, ompi_coll_tuned_stream, + "coll:tuned:component_open Reading collective rules file [%s] which format is %d", + ompi_coll_tuned_dynamic_rules_filename, + ompi_coll_tuned_dynamic_rules_fileformat); + rc = ompi_coll_base_read_rules_config_file( ompi_coll_tuned_dynamic_rules_filename, + ompi_coll_tuned_dynamic_rules_fileformat, + &(mca_coll_tuned_component.all_base_rules), COLLCOUNT); if( rc >= 0 ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_open Read %d valid 
rules\n", rc)); + opal_output_verbose(10, ompi_coll_tuned_stream,"coll:tuned:module_open Read %d valid rules\n", rc); + if(ompi_coll_base_framework.framework_verbose >= 50) { + ompi_coll_base_dump_all_rules (mca_coll_tuned_component.all_base_rules, COLLCOUNT); + } } else { - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_open Reading collective rules file failed\n")); + opal_output_verbose(1, ompi_coll_tuned_stream,"coll:tuned:module_open Reading collective rules file failed\n"); + char error_name[12]; + sprintf(error_name,"file fail%1d", rc); + error_name[11] = '\0'; + opal_show_help("help-mpi-coll-tuned.txt", (const char*)error_name, true, + ompi_coll_tuned_dynamic_rules_filename, ompi_coll_tuned_dynamic_rules_fileformat); mca_coll_tuned_component.all_base_rules = NULL; } } } - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_open: done!")); + opal_output_verbose(10, ompi_coll_tuned_stream, "coll:tuned:component_open: done!"); return OMPI_SUCCESS; } @@ -253,17 +283,17 @@ static int tuned_open(void) /* i.e. 
alg table and dynamic changable rules if allocated etc */ static int tuned_close(void) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_close: called")); - /* dealloc alg table if allocated */ /* dealloc dynamic changable rules if allocated */ - - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_close: done!")); - if( NULL != mca_coll_tuned_component.all_base_rules ) { - ompi_coll_tuned_free_all_rules(mca_coll_tuned_component.all_base_rules, COLLCOUNT); + ompi_coll_base_free_all_rules(mca_coll_tuned_component.all_base_rules, COLLCOUNT); mca_coll_tuned_component.all_base_rules = NULL; } + opal_output_verbose(10, ompi_coll_tuned_stream, "coll:tuned:component_close: done!"); + /* close stream */ + if(ompi_coll_tuned_stream >= 0) { + opal_output_close(ompi_coll_tuned_stream); + } return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c index 491141cbdd6..8c0e8d96da4 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c @@ -14,6 +14,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -71,7 +72,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (const void *sbuf, void *rbuf, int c ompi_datatype_type_size (dtype, &dsize); dsize *= count; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLREDUCE], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[ALLREDUCE], dsize, &faninout, &segsize, &ignoreme); if (alg) { @@ -122,7 +123,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(const void *sbuf, int scount, comsize = ompi_comm_size(comm); dsize *= (ptrdiff_t)comsize * (ptrdiff_t)scount; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLTOALL], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[ALLTOALL], dsize, &faninout, &segsize, &max_requests); if (alg) { @@ -174,7 +175,7 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(const void *sbuf, const int *sco if (tuned_module->com_rules[ALLTOALLV]) { int alg, faninout, segsize, max_requests; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLTOALLV], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[ALLTOALLV], 0, &faninout, &segsize, &max_requests); if (alg) { @@ -216,7 +217,7 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm, /* we do, so calc the message size or what ever we need and use this for the evaluation */ int alg, faninout, segsize, ignoreme; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[BARRIER], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[BARRIER], 0, &faninout, &segsize, &ignoreme); if (alg) { @@ -260,7 +261,7 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buf, int count, ompi_datatype_type_size (dtype, &dsize); dsize *= count; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[BCAST], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[BCAST], dsize, 
&faninout, &segsize, &ignoreme); if (alg) { @@ -311,7 +312,7 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( const void *sbuf, void *rbuf, ompi_datatype_type_size(dtype, &dsize); dsize *= count; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[REDUCE], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[REDUCE], dsize, &faninout, &segsize, &max_requests); if (alg) { @@ -366,7 +367,7 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(const void *sbuf, void *rbu ompi_datatype_type_size (dtype, &dsize); dsize *= count; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[REDUCESCATTER], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[REDUCESCATTER], dsize, &faninout, &segsize, &ignoreme); if (alg) { @@ -418,7 +419,7 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(const void *sbuf, voi ompi_datatype_type_size (dtype, &dsize); dsize *= rcount * size; - alg = ompi_coll_tuned_get_target_method_params(tuned_module->com_rules[REDUCESCATTERBLOCK], + alg = ompi_coll_base_get_target_method_params(tuned_module->com_rules[REDUCESCATTERBLOCK], dsize, &faninout, &segsize, &ignoreme); if (alg) { @@ -472,7 +473,7 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(const void *sbuf, int scount, comsize = ompi_comm_size(comm); dsize *= (ptrdiff_t)comsize * (ptrdiff_t)scount; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLGATHER], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[ALLGATHER], dsize, &faninout, &segsize, &ignoreme); if (alg) { /* we have found a valid choice from the file based rules for @@ -537,7 +538,7 @@ int ompi_coll_tuned_allgatherv_intra_dec_dynamic(const void *sbuf, int scount, per_rank_size = total_size / comsize; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLGATHERV], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[ALLGATHERV], per_rank_size, 
&faninout, &segsize, &ignoreme); if (alg) { /* we have found a valid choice from the file based rules for @@ -592,7 +593,7 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, int scount, ompi_datatype_type_size (sdtype, &dsize); dsize *= scount * comsize; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[GATHER], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[GATHER], dsize, &faninout, &segsize, &max_requests); if (alg) { @@ -641,7 +642,7 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(const void *sbuf, int scount, ompi_datatype_type_size (sdtype, &dsize); dsize *= scount * comsize; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[SCATTER], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[SCATTER], dsize, &faninout, &segsize, &max_requests); if (alg) { @@ -689,7 +690,7 @@ int ompi_coll_tuned_exscan_intra_dec_dynamic(const void *sbuf, void* rbuf, int c ompi_datatype_type_size (dtype, &dsize); dsize *= comsize; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[EXSCAN], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[EXSCAN], dsize, &faninout, &segsize, &max_requests); if (alg) { @@ -732,7 +733,7 @@ int ompi_coll_tuned_scan_intra_dec_dynamic(const void *sbuf, void* rbuf, int cou ompi_datatype_type_size (dtype, &dsize); dsize *= comsize; - alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[SCAN], + alg = ompi_coll_base_get_target_method_params (tuned_module->com_rules[SCAN], dsize, &faninout, &segsize, &max_requests); if (alg) { diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c deleted file mode 100644 index a259c789ac2..00000000000 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - 
* Corporation. All rights reserved. - * Copyright (c) 2004-2020 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include -#include - -#include "mpi.h" -#include "ompi/mca/mca.h" -#include "coll_tuned.h" - -/* need to include our own topo prototypes so we can malloc data on the comm correctly */ -#include "ompi/mca/coll/base/coll_base_topo.h" - -/* need file reading function */ -#include "ompi/mca/coll/base/coll_base_util.h" - -/* also need the dynamic rule structures */ -#include "coll_tuned_dynamic_rules.h" - -/* and our own prototypes */ -#include "coll_tuned_dynamic_file.h" - -static int fileline=0; /* used for verbose error messages */ - -#define getnext(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) - -/* - * Reads a rule file called fname - * Builds the algorithm rule table for a max of n_collectives - * - * If an error occurs it removes rule table and then exits with a very verbose - * error message (this stops the user using a half baked rule table - * - * Returns the number of actual collectives that a rule exists for - * (note 0 is NOT an error) - * - */ - -int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives) -{ - long CI, NCS, CS, ALG, NMS, FANINOUT, X, MS, SS; - FILE *fptr = (FILE*) NULL; - int x, ncs, nms; - - ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; /* complete table of rules */ - - /* individual pointers to sections of rules */ - ompi_coll_alg_rule_t *alg_p 
= (ompi_coll_alg_rule_t*) NULL; - ompi_coll_com_rule_t *com_p = (ompi_coll_com_rule_t*) NULL; - ompi_coll_msg_rule_t *msg_p = (ompi_coll_msg_rule_t*) NULL; - - /* stats info */ - int total_alg_count = 0; - int total_com_count = 0; - int total_msg_count = 0; - - if (!fname) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Gave NULL as rule table configuration file for tuned collectives... ignoring!\n")); - return (-1); - } - - if (!rules) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Gave NULL as rule table result ptr!... ignoring!\n")); - return (-2); - } - - if (n_collectives<1) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Gave %d as max number of collectives in the rule table configuration file for tuned collectives!... ignoring!\n", n_collectives)); - return (-3); - } - - fptr = fopen (fname, "r"); - if (!fptr) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot read rules file [%s]\n", fname)); - goto on_file_error; - } - - /* make space and init the algorithm rules for each of the n_collectives MPI collectives */ - alg_rules = ompi_coll_tuned_mk_alg_rules (n_collectives); - if (NULL == alg_rules) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate rules for file [%s]\n", fname)); - goto on_file_error; - } - - if( (getnext(fptr, &X) < 0) || (X < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline)); - goto on_file_error; - } - if (X>n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline)); - goto on_file_error; - } - - for (x=0;x=n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %ld is greater than MPI collectives possible %d. 
Error around line %d\n", CI, n_collectives, fileline)); - goto on_file_error; - } - - if (alg_rules[CI].alg_rule_id != CI) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %ld\n", CI)); - goto on_file_error; - } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %ld\n", CI)); - alg_p = &alg_rules[CI]; - - alg_p->alg_rule_id = CI; - alg_p->n_com_sizes = 0; - alg_p->com_rules = (ompi_coll_com_rule_t *) NULL; - - if( (getnext (fptr, &NCS) < 0) || (NCS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %ld at around line %d\n", CI, fileline)); - goto on_file_error; - } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCS, CI)); - alg_p->n_com_sizes = NCS; - alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI); - if (NULL == alg_p->com_rules) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate com rules for file [%s]\n", fname)); - goto on_file_error; - } - - for (ncs=0;ncscom_rules[ncs]); - - if( (getnext (fptr, &CS) < 0) || (CS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); - goto on_file_error; - } - - com_p->mpi_comsize = CS; - - if( (getnext (fptr, &NMS) < 0) || (NMS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); - goto on_file_error; - } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %ld for dynamic rule for collective ID %ld and comm size %ld\n", - NMS, CI, CS)); - com_p->n_msg_sizes = NMS; - com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS); - if (NULL == com_p->msg_rules) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate msg rules for file [%s]\n", fname)); - goto on_file_error; - } - - msg_p = com_p->msg_rules; - - 
for (nms=0;nmsmsg_rules[nms]); - - if( (getnext (fptr, &MS) < 0) || (MS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); - goto on_file_error; - } - msg_p->msg_size = (size_t)MS; - - if( (getnext (fptr, &ALG) < 0) || (ALG < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); - goto on_file_error; - } - msg_p->result_alg = ALG; - - if( (getnext (fptr, &FANINOUT) < 0) || (FANINOUT < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); - goto on_file_error; - } - msg_p->result_topo_faninout = FANINOUT; - - if( (getnext (fptr, &SS) < 0) || (SS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); - goto on_file_error; - } - msg_p->result_segsize = SS; - - if (!nms && MS) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline)); - goto on_file_error; - } - - total_msg_count++; - - } /* msg size */ - - total_com_count++; - - } /* comm size */ - - total_alg_count++; - OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %ld\n", CI)); - - } /* per collective */ - - fclose (fptr); - - OPAL_OUTPUT((ompi_coll_tuned_stream,"\nConfigure file Stats\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Collectives with rules\t\t\t: %5d\n", total_alg_count)); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Communicator sizes with rules\t\t: %5d\n", 
total_com_count)); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Message sizes with rules\t\t: %5d\n", total_msg_count)); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Lines in configuration file read\t\t: %5d\n", fileline)); - - /* return the rules to the caller */ - *rules = alg_rules; - - return (total_alg_count); - - - on_file_error: - - /* here we close out the file and delete any memory allocated nicely */ - /* we return back a verbose message and a count of -1 algorithms read */ - /* draconian but its better than having a bad collective decision table */ - - OPAL_OUTPUT((ompi_coll_tuned_stream,"read_rules_config_file: bad configure file [%s]. Read afar as line %d\n", fname, fileline)); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Ignoring user supplied tuned collectives configuration decision file.\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Switching back to [compiled in] fixed decision table.\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Fix errors as listed above and try again.\n")); - - /* deallocate memory if allocated */ - if (alg_rules) ompi_coll_tuned_free_all_rules (alg_rules, n_collectives); - - /* close file */ - if (fptr) fclose (fptr); - - *rules = (ompi_coll_alg_rule_t*) NULL; - return (-1); -} - diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.h b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.h deleted file mode 100644 index 595e436fa49..00000000000 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_COLL_TUNED_DYNAMIC_FILE_H_HAS_BEEN_INCLUDED -#define MCA_COLL_TUNED_DYNAMIC_FILE_H_HAS_BEEN_INCLUDED - -#include "ompi_config.h" - -/* also need the dynamic rule structures */ -#include "coll_tuned_dynamic_rules.h" - - -BEGIN_C_DECLS - -int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives); - - -END_C_DECLS -#endif /* MCA_COLL_TUNED_DYNAMIC_FILE_H_HAS_BEEN_INCLUDED */ - - diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c deleted file mode 100644 index 2c2b4469635..00000000000 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c +++ /dev/null @@ -1,391 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2015 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2011-2012 FUJITSU LIMITED. All rights reserved. - * Copyright (c) 2017 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "mpi.h" -#include "ompi/mca/mca.h" -#include "ompi/constants.h" -#include "coll_tuned.h" - -/* need to include our own topo prototypes so we can malloc data on the comm correctly */ -#include "ompi/mca/coll/base/coll_base_topo.h" - -/* also need the dynamic rule structures */ -#include "coll_tuned_dynamic_rules.h" - -#include -#include - -#include "ompi/mca/coll/base/coll_base_util.h" - - -ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg) -{ - int i; - ompi_coll_alg_rule_t* alg_rules; - - alg_rules = (ompi_coll_alg_rule_t *) calloc (n_alg, sizeof (ompi_coll_alg_rule_t)); - if (!alg_rules) return (alg_rules); - - /* set all we can at this point */ - for (i=0;ialg_rule_id, - msg_p->com_rule_id, msg_p->mpi_comsize, msg_p->msg_rule_id)); - - OPAL_OUTPUT((ompi_coll_tuned_stream,"msg_size %10lu -> algorithm %2d\ttopo in/out %2d\tsegsize %5ld\tmax_requests %4d\n", - msg_p->msg_size, msg_p->result_alg, msg_p->result_topo_faninout, msg_p->result_segsize, - msg_p->result_max_requests)); - - return (0); -} - - -int ompi_coll_tuned_dump_com_rule (ompi_coll_com_rule_t* com_p) -{ - int i; - - if (!com_p) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Com rule was a NULL ptr?!\n")); - return (-1); - } - - OPAL_OUTPUT((ompi_coll_tuned_stream, "alg_id %3d\tcom_id %3d\tcom_size %3d\t", com_p->alg_rule_id, com_p->com_rule_id, com_p->mpi_comsize)); - - if (!com_p->n_msg_sizes) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"no msgsizes defined\n")); - return (0); - } - - OPAL_OUTPUT((ompi_coll_tuned_stream,"number of message sizes %3d\n", com_p->n_msg_sizes)); - - for (i=0;in_msg_sizes;i++) { - ompi_coll_tuned_dump_msg_rule (&(com_p->msg_rules[i])); - } - - return (0); -} - - -int ompi_coll_tuned_dump_alg_rule (ompi_coll_alg_rule_t* alg_p) -{ - int i; - - if (!alg_p) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Algorithm rule was a NULL ptr?!\n")); - return (-1); - } - - 
OPAL_OUTPUT((ompi_coll_tuned_stream,"alg_id %3d\t", alg_p->alg_rule_id)); - - if (!alg_p->n_com_sizes) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"no coms defined\n")); - return (0); - } - - OPAL_OUTPUT((ompi_coll_tuned_stream,"number of com sizes %3d\n", alg_p->n_com_sizes)); - - for (i=0;in_com_sizes;i++) { - ompi_coll_tuned_dump_com_rule (&(alg_p->com_rules[i])); - } - - return (0); -} - - -int ompi_coll_tuned_dump_all_rules (ompi_coll_alg_rule_t* alg_p, int n_rules) -{ - int i; - - if (!alg_p) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Algorithm rule was a NULL ptr?!\n")); - return (-1); - } - - OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of algorithm rules %3d\n", n_rules)); - - for (i=0;in_msg_sizes) { - msg_p = com_p->msg_rules; - - if (!msg_p) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL msg_rules when msg count was %d\n", com_p->n_msg_sizes)); - rc = -1; /* some error */ - } - else { - /* ok, memory exists for the msg rules so free that first */ - free (com_p->msg_rules); - com_p->msg_rules = (ompi_coll_msg_rule_t*) NULL; - } - - } /* if we have msg rules to free as well */ - - return (rc); -} - - -int ompi_coll_tuned_free_coms_in_alg_rule (ompi_coll_alg_rule_t* alg_p) -{ - int rc=0; - int i; - - ompi_coll_com_rule_t* com_p; - - if (!alg_p) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL alg_rule ptr\n")); - return (-1); - } - - if (alg_p->n_com_sizes) { - com_p = alg_p->com_rules; - - if (!com_p) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL com_rules when com count was %d\n", alg_p->n_com_sizes)); - } else { - /* ok, memory exists for the com rules so free their message rules first */ - for( i = 0; i < alg_p->n_com_sizes; i++ ) { - com_p = &(alg_p->com_rules[i]); - ompi_coll_tuned_free_msg_rules_in_com_rule (com_p); - } - /* we are now free to free the com rules themselives */ - free (alg_p->com_rules); - alg_p->com_rules = (ompi_coll_com_rule_t*) NULL; - } - - } /* if we have msg rules to free as well */ - - 
return (rc); -} - - -int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs) -{ - int i; - int rc = 0; - - for( i = 0; i < n_algs; i++ ) { - rc += ompi_coll_tuned_free_coms_in_alg_rule (&(alg_p[i])); - } - - free (alg_p); - - return (rc); -} - -/* - * query functions - * i.e. the functions that get me the algorithm, topo fanin/out and segment size fast - * and also get the rules that are needed by each communicator as needed - * - */ - -/* - * This function is used to get the pointer to the nearest (less than or equal) - * com rule for this MPI collective (alg_id) for a given - * MPI communicator size. The complete rule base must be presented. - * - * If no rule exits returns NULL, else the com rule ptr - * (which can be used in the coll_tuned_get_target_method_params() call) - * - */ -ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* rules, int alg_id, int mpi_comsize) -{ - ompi_coll_alg_rule_t* alg_p = (ompi_coll_alg_rule_t*) NULL; - ompi_coll_com_rule_t* com_p = (ompi_coll_com_rule_t*) NULL; - ompi_coll_com_rule_t* best_com_p = (ompi_coll_com_rule_t*) NULL; - int i; - - if (!rules) { /* no rule base no resulting com rule */ - return ((ompi_coll_com_rule_t*)NULL); - } - - alg_p = &(rules[alg_id]); /* get the algorithm rule pointer */ - - if (!alg_p->n_com_sizes) { /* check for count of communicator sizes */ - return ((ompi_coll_com_rule_t*)NULL); /* no com sizes so no rule */ - } - - /* ok have some com sizes, now to find the one closest to my mpi_comsize */ - - /* make a copy of the first com rule */ - best_com_p = com_p = alg_p->com_rules; - i = 0; - - while( i < alg_p->n_com_sizes ) { - if (com_p->mpi_comsize > mpi_comsize) { - break; - } - best_com_p = com_p; - /* go to the next entry */ - com_p++; - i++; - } - - OPAL_OUTPUT((ompi_coll_tuned_stream,"Selected the following com rule id %d\n", best_com_p->com_rule_id)); - ompi_coll_tuned_dump_com_rule (best_com_p); - - return (best_com_p); -} - -/* - * This 
function takes a com_rule ptr (from the communicators coll tuned data structure) - * (Which is chosen for a particular MPI collective) - * and a (total_)msg_size and it returns (0) and a algorithm to use and a recommended topo faninout and segment size - * all based on the user supplied rules - * - * Just like the above functions it uses a less than or equal msg size - * (hense config file must have a default defined for '0' if we reach this point) - * else if no rules match we return '0' + '0,0' or used fixed decision table with no topo chand and no segmentation - * of users data.. shame. - * - * On error return 0 so we default to fixed rules anyway :) - * - */ - -int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rule, size_t mpi_msgsize, int *result_topo_faninout, - int* result_segsize, int* max_requests) -{ - ompi_coll_msg_rule_t* msg_p = (ompi_coll_msg_rule_t*) NULL; - ompi_coll_msg_rule_t* best_msg_p = (ompi_coll_msg_rule_t*) NULL; - int i; - - /* No rule or zero rules */ - if( (NULL == base_com_rule) || (0 == base_com_rule->n_msg_sizes)) { - return (0); - } - - /* ok have some msg sizes, now to find the one closest to my mpi_msgsize */ - - /* make a copy of the first msg rule */ - best_msg_p = msg_p = base_com_rule->msg_rules; - i = 0; - - while (in_msg_sizes) { - /* OPAL_OUTPUT((ompi_coll_tuned_stream,"checking mpi_msgsize %d against com_id %d msg_id %d index %d msg_size %d", */ - /* mpi_msgsize, msg_p->com_rule_id, msg_p->msg_rule_id, i, msg_p->msg_size)); */ - if (msg_p->msg_size <= mpi_msgsize) { - best_msg_p = msg_p; - /* OPAL_OUTPUT((ompi_coll_tuned_stream(":ok\n")); */ - } - else { - /* OPAL_OUTPUT((ompi_coll_tuned_stream(":nop\n")); */ - break; - } - /* go to the next entry */ - msg_p++; - i++; - } - - OPAL_OUTPUT((ompi_coll_tuned_stream,"Selected the following msg rule id %d\n", best_msg_p->msg_rule_id)); - ompi_coll_tuned_dump_msg_rule (best_msg_p); - - /* return the segment size */ - *result_topo_faninout = 
best_msg_p->result_topo_faninout; - - /* return the segment size */ - *result_segsize = best_msg_p->result_segsize; - - /* return the maximum requests */ - *max_requests = best_msg_p->result_max_requests; - - /* return the algorithm/method to use */ - return (best_msg_p->result_alg); -} diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.h b/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.h deleted file mode 100644 index 7e8f672d21d..00000000000 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.h +++ /dev/null @@ -1,104 +0,0 @@ - -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2011-2012 FUJITSU LIMITED. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_COLL_TUNED_DYNAMIC_RULES_H_HAS_BEEN_INCLUDED -#define MCA_COLL_TUNED_DYNAMIC_RULES_H_HAS_BEEN_INCLUDED - -#include "ompi_config.h" - -BEGIN_C_DECLS - - -typedef struct msg_rule_s { - /* paranoid / debug */ - int mpi_comsize; /* which MPI comm size this is is for */ - - /* paranoid / debug */ - int alg_rule_id; /* unique alg rule id */ - int com_rule_id; /* unique com rule id */ - int msg_rule_id; /* unique msg rule id */ - - /* RULE */ - size_t msg_size; /* message size */ - - /* RESULT */ - int result_alg; /* result algorithm to use */ - int result_topo_faninout; /* result topology fan in/out to use (if applicable) */ - long result_segsize; /* result segment size to use */ - int result_max_requests; /* maximum number of outstanding requests (if applicable) */ -} ompi_coll_msg_rule_t; - - -typedef struct com_rule_s { - /* paranoid / debug */ - int mpi_comsize; /* which MPI comm size this is is for */ - - /* paranoid / debug */ - int alg_rule_id; /* unique alg rule id */ - int com_rule_id; /* unique com rule id */ - - /* RULE */ - int n_msg_sizes; - ompi_coll_msg_rule_t *msg_rules; - -} ompi_coll_com_rule_t; - - -typedef struct alg_rule_s { - /* paranoid / debug */ - int alg_rule_id; /* unique alg rule id */ - - /* RULE */ - int n_com_sizes; - ompi_coll_com_rule_t *com_rules; - -} ompi_coll_alg_rule_t; - -/* function prototypes */ - -/* these are used to build the rule tables (by the read file routines) */ -ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg); -ompi_coll_com_rule_t* ompi_coll_tuned_mk_com_rules (int n_com_rules, int alg_rule_id); -ompi_coll_msg_rule_t* ompi_coll_tuned_mk_msg_rules (int n_msg_rules, int alg_rule_id, int com_rule_id, int mpi_comsize); - -/* debugging support */ -int ompi_coll_tuned_dump_msg_rule (ompi_coll_msg_rule_t* msg_p); -int ompi_coll_tuned_dump_com_rule (ompi_coll_com_rule_t* com_p); -int ompi_coll_tuned_dump_alg_rule 
(ompi_coll_alg_rule_t* alg_p); -int ompi_coll_tuned_dump_all_rules (ompi_coll_alg_rule_t* alg_p, int n_rules); - -/* free alloced memory routines, used by file and tuned component/module */ -int ompi_coll_tuned_free_msg_rules_in_com_rule (ompi_coll_com_rule_t* com_p); -int ompi_coll_tuned_free_coms_in_alg_rule (ompi_coll_alg_rule_t* alg_p); -int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs); - - -/* the IMPORTANT routines, i.e. the ones that do stuff for everyday communicators and collective calls */ - -ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* rules, int alg_id, int mpi_comsize); - -int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rule, size_t mpi_msgsize, - int* result_topo_faninout, int* result_segsize, - int* max_requests); - - -END_C_DECLS -#endif /* MCA_COLL_TUNED_DYNAMIC_RULES_H_HAS_BEEN_INCLUDED */ - diff --git a/ompi/mca/coll/tuned/coll_tuned_module.c b/ompi/mca/coll/tuned/coll_tuned_module.c index bf2c7da1434..8daf9877daa 100644 --- a/ompi/mca/coll/tuned/coll_tuned_module.c +++ b/ompi/mca/coll/tuned/coll_tuned_module.c @@ -13,6 +13,7 @@ * Copyright (c) 2016 Intel, Inc. All rights reserved. * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2020 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -31,8 +32,9 @@ #include "ompi/mca/coll/base/base.h" #include "ompi/mca/coll/base/coll_base_topo.h" #include "coll_tuned.h" -#include "coll_tuned_dynamic_rules.h" -#include "coll_tuned_dynamic_file.h" +#include "ompi/mca/coll/base/coll_base_dynamic_rules.h" +#include "ompi/mca/coll/base/coll_base_dynamic_file.h" +#include "ompi/mca/coll/base/coll_base_util.h" static int tuned_module_enable(mca_coll_base_module_t *module, struct ompi_communicator_t *comm); @@ -159,8 +161,8 @@ ompi_coll_tuned_forced_getvalues( enum COLLTYPE type, } \ if( NULL != mca_coll_tuned_component.all_base_rules ) { \ (TMOD)->com_rules[(TYPE)] \ - = ompi_coll_tuned_get_com_rule_ptr( mca_coll_tuned_component.all_base_rules, \ - (TYPE), size ); \ + = ompi_coll_base_get_com_rule_ptr( mca_coll_tuned_component.all_base_rules, \ + (TYPE), nnodes, size ); \ if( NULL != (TMOD)->com_rules[(TYPE)] ) { \ need_dynamic_decision = 1; \ } \ @@ -178,7 +180,9 @@ static int tuned_module_enable( mca_coll_base_module_t *module, struct ompi_communicator_t *comm ) { - int size; + /* Variables used in COLL_TUNED_EXECUTE_IF_DYNAMIC macro */ + int size, nnodes; + mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t *) module; mca_coll_base_comm_t *data = NULL; @@ -191,6 +195,9 @@ tuned_module_enable( mca_coll_base_module_t *module, size = ompi_comm_size(comm); } + /* Get the number of nodes in communicator */ + nnodes = ompi_coll_base_get_nnodes(comm); + OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init nnodes %d.", nnodes)); /** * we still malloc data as it is used by the TUNED modules * if we don't allocate it and fall back to a BASIC module routine then confuses debuggers diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c index 40e500d1c04..c1b7de20d69 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c @@ -31,6 +31,7 
@@ static int coll_tuned_reduce_segment_size = 0; static int coll_tuned_reduce_max_requests; static int coll_tuned_reduce_tree_fanout; static int coll_tuned_reduce_chain_fanout; +static bool coll_tuned_reduce_allow_non_commutative_support = true; /* valid values for coll_tuned_reduce_forced_algorithm */ static const mca_base_var_enum_value_t reduce_algorithms[] = { @@ -140,6 +141,19 @@ int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_m } coll_tuned_reduce_max_requests = 0; } + /* Add a MCA parameter to enable/disable discarding of algorithm in case of non commutative operations. + * When algorithm configuration file (including reduce definition) or reduce_algorithm MCA parameter are used + * loaded algorithms may not support non commutative operations. Consequently, an issue happens when + * the operation argument of the MPI_Reduce call is non commutative. To avoid this strong limitation, we provide + * a discarding mechanism on top of algorithm selection to force the use of a fallback algorithm. This mechanism + * can be enable/disable using the following MCA parameter. */ + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "reduce_allow_non_commutative_support", + "Switch to allow non commutative operations in reduce algorithms designed for only commutative operations. 
Be carefull, enabling this parameter may lead to erroneous numerical results.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &coll_tuned_reduce_allow_non_commutative_support); return (MPI_SUCCESS); } @@ -155,6 +169,25 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, int count OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d", algorithm, faninout, segsize)); + if(!coll_tuned_reduce_allow_non_commutative_support) { + /* fallback algorithm mechanism */ + /* If the operation is non commutative and algorithm is neither basic linear nor ignore */ + if(!ompi_op_is_commute(op) && 1 != algorithm && 0 != algorithm) { + /* If algorithm is in-order binary with segmentation */ + if(6 == algorithm) { + if (0 != segsize) { + opal_output_verbose(5,ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this in_order_binary: segmentation can't be enabled when op is not commutative"); + /* disabling segmentation is enough */ + segsize = 0; + } + } else { + /* Otherwise we have to restrict selection to linear or in-order_binary */ + opal_output_verbose(5,ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this algorithm id %d can't be chosen when op is not commutative", algorithm); + return ompi_coll_tuned_reduce_intra_dec_fixed(sbuf, rbuf, count, dtype, op, root, comm, module); + } + } + } + switch (algorithm) { case (0): return ompi_coll_tuned_reduce_intra_dec_fixed(sbuf, rbuf, count, dtype, op, root, comm, module); diff --git a/ompi/mca/coll/tuned/help-mpi-coll-tuned.txt b/ompi/mca/coll/tuned/help-mpi-coll-tuned.txt new file mode 100644 index 00000000000..7efcac8c076 --- /dev/null +++ b/ompi/mca/coll/tuned/help-mpi-coll-tuned.txt @@ -0,0 +1,219 @@ +# -*- text -*- +# +# Copyright (c) 2019 Bull SAS. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open MPI's tuned component +# (which uses the base collective implementations and config file parser). +# +[file fail] +The coll/tuned module was unable to load rules contained in file %s. +The rule configuration format specified was %d. +For further information on the error that occurred, +you can enable debug output by setting the verbosity level as follows: + +export OMPI_MCA_coll_tuned_verbose=1 + +The error is potentially due to: + +* File missing. + Check if the file path defined by the MCA parameter + OMPI_MCA_coll_tuned_dynamic_rules_filename is valid. + +* Format mismatch. + Verify the value of the MCA parameter OMPI_MCA_coll_tuned_dynamic_rules_fileformat. + 0: Rules are defined according to a combination of + collective_id, communicator size and message size. + For example: + + 1 # collective number + 9 # gather (1st level) + 2 # comm size config number + 1 # Ranks number >= 1 (2nd level) + 1 # msg size config number + 0 1 0 0 # Msgsize >= 0 (3rd level): basic linear + 8 # Ranks number >= 8 (2nd level) + 3 # msg size config number + 0 1 0 0 # Msgsize >= 0 (3rd level): basic linear + 8192 2 0 0 # Msgsize >= 8192 (3rd level): binomial + 4194304 3 0 1024 # Msgsize >= 4194304 (3rd level): linearsync (segsize of 1024 bytes) + + 1: Rules are defined according to a combination of + collective_id, communicator nodes number, communicator size and message size.
+ For example: + 1 # collective number + 9 # gather (1st level) + 2 # nodes nb config number + 1 # nodes number >= 1 (2nd level) + 2 # comm size config number + 1 # Ranks number >= 1 (3rd level) + 1 # msg size config number + 0 1 0 0 # Msgsize >= 0 (4th level): basic linear + 8 # Ranks number >= 8 (3rd level) + 3 # msg size config number + 0 1 0 0 # Msgsize >= 0 (4th level): basic linear + 8192 2 0 0 # Msgsize >= 8192 (4th level): binomial + 4194304 3 0 1024 # Msgsize >= 4194304 (4th level): linearsync (segsize of 1024 bytes) + 8 # nodes number >= 8 (2nd level) + 1 # comm size config number + 8 # ranks number >= 8 (3rd level) + 3 # msg size config number + 0 1 0 0 # Msgsize >= 0 (4th level): basic linear + 8192 2 0 0 # Msgsize >= 8192 (4th level): binomial + 4194304 3 0 1024 # Msgsize >= 4194304 (4th level): linearsync (segsize of 1024 bytes) + +* Incorrect collective ID(s). + Check if the collective IDs in the config file match the following list: + ALLGATHER = 0, ALLGATHERV = 1, ALLREDUCE = 2, ALLTOALL = 3, ALLTOALLV = 4, ALLTOALLW = 5, + BARRIER = 6, BCAST = 7, EXSCAN = 8, GATHER = 9, GATHERV = 10, + REDUCE = 11, REDUCESCATTER = 12, REDUCESCATTERBLOCK = 13, SCAN = 14, + SCATTER = 15, SCATTERV = 16, NEIGHBOR_ALLGATHER = 17, NEIGHBOR_ALLGATHERV = 18, + NEIGHBOR_ALLTOALL = 19, NEIGHBOR_ALLTOALLV = 20, NEIGHBOR_ALLTOALLW = 21. + +This error causes the tuned component to ignore the provided dynamic rules. +Once the error has been fixed, you can check the obtained rules using verbosity. +The following verbosity level thresholds are defined: +OMPI_MCA_coll_tuned_verbose >= 10 (basic summary print) +OMPI_MCA_coll_tuned_verbose >= 50 (rules print) +OMPI_MCA_coll_tuned_verbose >= 100 (file parsing logs print) +# +[file fail-1] + +The coll/tuned module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The coll file parser was called with a null filename string. +This is an internal error.
The parser should be used only if a rules filename was set by the user. + +This error causes the tuned component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-2] + +The coll/tuned module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The coll file parser was called with a null rules pointer. +This is an internal error. The rules object pointer must be valid. + +This error causes the tuned component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-3] + +The coll/tuned module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The coll file parser was called with an invalid max collectives number. +This is an internal error. The max collectives number must be greater than 0. + +This error causes the tuned component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-4] + +The coll/tuned module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The file can't be opened. Either the file is missing or access rights are wrong. +Check if the file path defined by the MCA parameter +OMPI_MCA_coll_tuned_dynamic_rules_filename is valid.
+ +This error causes the tuned component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-5] + +The coll/tuned module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The file format is unknown. It must be either + 0: Rules are defined according to a combination of + collective_id, communicator size and message size + or 1: Rules are defined according to a combination of + collective_id, communicator nodes number, communicator size and message size. + +This error causes the tuned component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-6] + +The coll/tuned module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +Rules object allocation failed. This can be caused by memory resource exhaustion. + +This error causes the tuned component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-7] + +The coll/tuned module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +The file parsing aborted. This can be caused by: + a format mismatch + or a missing line + or an invalid configuration number.
+ +This error causes the tuned component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-8] + +The coll/tuned module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +Inconsistent collective ID or collective number. The collective number can't be greater than 22 and collective IDs must be in the range [0,21]. +As a reminder, the collective IDs are listed below: +ALLGATHER = 0, ALLGATHERV = 1, ALLREDUCE = 2, +ALLTOALL = 3, ALLTOALLV = 4, ALLTOALLW = 5, +BARRIER = 6, BCAST = 7, EXSCAN = 8, GATHER = 9, GATHERV = 10, +REDUCE = 11, REDUCESCATTER = 12, REDUCESCATTERBLOCK = 13, SCAN = 14, +SCATTER = 15, SCATTERV = 16, NEIGHBOR_ALLGATHER = 17, NEIGHBOR_ALLGATHERV = 18, +NEIGHBOR_ALLTOALL = 19, NEIGHBOR_ALLTOALLV = 20, NEIGHBOR_ALLTOALLW = 21. + +This error causes the tuned component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +# +[file fail-9] + +The coll/tuned module was unable to load rules contained in file %s. +The rule configuration format specified was %d. + +Inconsistent collective ID in rules object. This is an internal error. + +This error causes the tuned component to ignore the provided dynamic rules. +Internal logs can be enabled using the verbose MCA parameter OMPI_MCA_coll_base_verbose. +OMPI_MCA_coll_base_verbose = 1 (errors) +OMPI_MCA_coll_base_verbose >= 50 (file based rules selection print) +OMPI_MCA_coll_base_verbose >= 100 (file parsing logs print) +