Skip to content

Commit

Permalink
Merge pull request open-mpi#11710 from wenduwan/fix_package_rank
Browse files Browse the repository at this point in the history
opal/ofi: follow up fixes to package rank calculation
  • Loading branch information
lrbison authored May 24, 2023
2 parents 79ea49f + 1a1b84a commit 18a7064
Showing 1 changed file with 14 additions and 7 deletions.
21 changes: 14 additions & 7 deletions opal/mca/common/ofi/common_ofi.c
Original file line number Diff line number Diff line change
Expand Up @@ -713,7 +713,7 @@ static int count_providers(struct fi_info *provider_list)
*/
static uint32_t get_package_rank(opal_process_info_t *process_info)
{
int i;
int i, level = 10;
uint16_t relative_locality, *package_rank_ptr;
uint32_t ranks_on_package = 0;
opal_process_name_t pname;
Expand Down Expand Up @@ -752,10 +752,6 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
for (i = 0; NULL != peers[i]; i++) {
pname.vpid = strtoul(peers[i], NULL, 10);

if ((uint16_t) pname.vpid == process_info->my_local_rank) {
return ranks_on_package;
}

locality_string = NULL;
// Get the LOCALITY_STRING for process[i]
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, &pname, &locality_string,
Expand All @@ -769,12 +765,22 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
locality_string);
free(locality_string);

if ((uint16_t) pname.vpid == process_info->my_local_rank) {
return ranks_on_package;
}

if (relative_locality & OPAL_PROC_ON_SOCKET) {
ranks_on_package++;
}
}
err:
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
if (opal_output_get_verbosity(opal_common_ofi.output) >= level) {
opal_show_help("help-common-ofi.txt", "package_rank failed", true, level);
}

if (locality_string)
free(locality_string);

return (uint32_t) process_info->myprocid.rank;
}

Expand All @@ -793,6 +799,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
int ret;
unsigned int num_provider = 0, provider_limit = 0;
bool provider_found = false;
uint32_t package_rank = 0;

/* Initialize opal_hwloc_topology if it is not already */
ret = opal_hwloc_base_get_topology();
Expand Down Expand Up @@ -853,9 +860,9 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
}

/* Select provider from local rank % number of providers */
uint32_t package_rank = get_package_rank(process_info);
if (num_provider >= 2) {
// If there are multiple NICs "close" to the process, try to calculate package_rank
package_rank = get_package_rank(process_info);
provider = provider_table[package_rank % num_provider];
} else if (num_provider == 1) {
provider = provider_table[num_provider - 1];
Expand Down

0 comments on commit 18a7064

Please sign in to comment.