From d29a19bf98d604a4b240879e3bbc03ed38aaae8f Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Wed, 7 Jun 2023 01:23:55 +0000 Subject: [PATCH 1/3] opal/common/ofi: refactor NIC selection logic This patch refactors the OFI NIC selection logic. It foremost improves the NIC search algorithm. Instead of searching for the closest NICs on the system, this patch directly compares the distances of the given providers and selects the nearest NIC. This change also makes it explicit that if the process is unbound, or the distance cannot be reliably calculated, a provider will be selected in round-robin fashion. Signed-off-by: Wenduo Wang (cherry picked from commit f5f3b93483958ce6196cb394635093d6673fbd92) --- opal/mca/common/ofi/common_ofi.c | 412 +++++++++++++++---------------- 1 file changed, 200 insertions(+), 212 deletions(-) diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index d9888f23f22..c7a9fabd3e9 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -487,12 +487,24 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr && !check_ep_attr(provider_info->ep_attr, provider->ep_attr) && !(provider_info->caps & ~(provider->caps)) && !(provider_info->mode & ~(provider->mode)) && provider_info->addr_format == provider->addr_format) { - return 0; + return OPAL_SUCCESS; } else { return OPAL_ERROR; } } +#if OPAL_OFI_PCI_DATA_AVAILABLE +static int get_provider_nic_pci(struct fi_info *provider, struct fi_pci_attr *pci) +{ + if (NULL != provider->nic && NULL != provider->nic->bus_attr + && FI_BUS_PCI == provider->nic->bus_attr->bus_type) { + *pci = provider->nic->bus_attr->attr.pci; + return OPAL_SUCCESS; + } + return OPAL_ERR_NOT_AVAILABLE; +} +#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */ + /** * Calculate device distances * @@ -557,177 +569,220 @@ static int compute_dev_distances(pmix_device_distance_t **distances, } /** - * Find the nearest devices to the current thread + * @brief Get 
the provider distance from the provided distance metrics + * + * @param[in] topology hwloc topology + * @param[in] provider Provider object + * @param[in] distances List of known device distances + * @param[in] num_distances Length of distances + * @param[out] distance Pointer to store the provider distance + * @return OPAL_SUCCESS if and only if the distance is found in the provided list + */ +#if OPAL_OFI_PCI_DATA_AVAILABLE +static int get_provider_distance(hwloc_topology_t topology, struct fi_info *provider, + pmix_device_distance_t *distances, int num_distances, + uint16_t *distance) +{ + hwloc_obj_t pcidev, osdev; + struct fi_pci_attr pci = {0}; + + if (OPAL_SUCCESS != get_provider_nic_pci(provider, &pci)) { + opal_output_verbose(1, opal_common_ofi.output, "Cannot determine PCI attributes of provider %s", + provider->domain_attr->name); + return OPAL_ERROR; + } + + pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id, + pci.function_id); + if (!pcidev) { + opal_output_verbose(1, opal_common_ofi.output, "Cannot locate PCI device of provider %s", + provider->domain_attr->name); + return OPAL_ERROR; + } + +#if HWLOC_API_VERSION < 0x00020000 + osdev = pcidev->first_child; +#else + osdev = pcidev->io_first_child; +#endif /* HWLOC_API_VERSION */ + for (; osdev != NULL; osdev = osdev->next_sibling) { + int i; + + if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) { + const char *nguid = hwloc_obj_get_info_by_name(osdev, "NodeGUID"); + const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID"); + + if (!nguid && !sguid) + continue; + + for (i = 0; i < num_distances; i++) { + char lsguid[20], lnguid[20]; + int ret; + + if (PMIX_DEVTYPE_OPENFABRICS != distances[i].type) { + continue; + } + + if (!distances[i].osname || !osdev->name + || strcmp(distances[i].osname, osdev->name)) + continue; + + ret = sscanf(distances[i].uuid, "fab://%19s::%19s", lnguid, lsguid); + if (ret != 2) + continue; + + if ((nguid && (0 == 
strcasecmp(lnguid, nguid))) + || (sguid && (0 == strcasecmp(lsguid, sguid)))) { + *distance = distances[i].mindist; + return OPAL_SUCCESS; + } + } + } else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) { + const char *address = hwloc_obj_get_info_by_name(osdev, "Address"); + if (!address) + continue; + for (i = 0; i < num_distances; i++) { + if (PMIX_DEVTYPE_NETWORK != distances[i].type) { + continue; + } + char *addr = strstr(distances[i].uuid, "://"); + if (!addr || addr + 3 > distances[i].uuid + strlen(distances[i].uuid)) + continue; + if (!strcmp(addr + 3, address)) { + *distance = distances[i].mindist; + return OPAL_SUCCESS; + } + } + } + } + + return OPAL_ERROR; +} +#else +static int get_provider_distance(struct fi_info *provider, hwloc_topology_t topology, + pmix_device_distance_t *distances, size_t num_distances, + uint16_t *distance) +{ + return OPAL_ERROR; +} +#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */ + +/** + * @brief Get the nearest device to the current thread * * Use the PMIx server or calculate the device distances, then out of the set of * returned distances find the subset of the nearest devices. This can be - * 1 or more. - * - * @param num_distances (OUT) number of entries in the returned array + * 0 or more. + * If there are multiple equidistant devices, break the tie using the rank. * - * @return An array of device distances which are nearest this thread - * or NULL if we fail to get the distances. In this case we will just - * revert to round robin. + * @param[in] topoloy hwloc topology + * @param[in] provider_list List of providers to select from + * @param[in] num_providers Number of providers in provider_list + * @param[in] rank local rank of the process + * @param[out] provider pointer to the selected provider * + * @return OPAL_SUCCESS if and only if a nearest provider is found. 
*/ -static pmix_device_distance_t * -get_nearest_nics(int *num_distances, pmix_value_t **valin) +static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_list, + size_t num_providers, uint32_t rank, struct fi_info **provider) { - size_t ndist, i; - int ret, idx = 0; + int ret; pmix_data_array_t *dptr; - uint16_t near = USHRT_MAX; + pmix_device_distance_t *distances; pmix_info_t directive; pmix_value_t *val = NULL; - pmix_device_distance_t *distances, *nearest = NULL; + size_t ndist, num_nearest = 0; + struct fi_info *current_provider = NULL; + uint16_t dists[num_providers], *dist = NULL, min_dist = USHRT_MAX; + uint32_t provider_rank = 0; PMIx_Info_load(&directive, PMIX_OPTIONAL, NULL, PMIX_BOOL); - ret = PMIx_Get(&opal_process_info.myprocid, - PMIX_DEVICE_DISTANCES, &directive, 1, &val); + ret = PMIx_Get(&opal_process_info.myprocid, PMIX_DEVICE_DISTANCES, &directive, 1, &val); PMIx_Info_destruct(&directive); if (ret != PMIX_SUCCESS || !val) { ret = compute_dev_distances(&distances, &ndist); if (ret) { + ret = OPAL_ERROR; goto out; } goto find_nearest; } if (PMIX_DATA_ARRAY != val->type) { + ret = OPAL_ERROR; goto out; } dptr = val->data.darray; if (NULL == dptr) { + ret = OPAL_ERROR; goto out; } if (PMIX_DEVICE_DIST != dptr->type) { + ret = OPAL_ERROR; goto out; } - distances = (pmix_device_distance_t*)dptr->array; + distances = (pmix_device_distance_t *) dptr->array; ndist = dptr->size; find_nearest: - nearest = calloc(sizeof(*distances), ndist); - if (!nearest) { - goto out; - } - - for (i = 0; i < ndist; i++) { - if (distances[i].type != PMIX_DEVTYPE_NETWORK && - distances[i].type != PMIX_DEVTYPE_OPENFABRICS) + for (current_provider = provider_list, dist = dists; NULL != current_provider; + current_provider = current_provider->next, ++dist) { + if (OPAL_SUCCESS != check_provider_attr(provider_list, current_provider)) { continue; - if (distances[i].mindist < near) { - idx = 0; - near = distances[i].mindist; - nearest[idx] = distances[i]; - 
idx++; - } else if (distances[i].mindist == near) { - nearest[idx] = distances[i]; - idx++; + } + if (OPAL_SUCCESS != get_provider_distance(topology, current_provider, distances, ndist, dist)) { + *dist = USHRT_MAX; + } + + if (*dist < min_dist) { + min_dist = *dist; + num_nearest = 1; + } else if (*dist == min_dist) { + ++num_nearest; + } + + if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)) { + opal_output_verbose(1, opal_common_ofi.output, "provider: %s dist: %d", + current_provider->domain_attr->name, *dist); } } - *num_distances = idx; + ret = OPAL_ERROR; + if (0 >= num_nearest) { + return ret; + } + provider_rank = rank % num_nearest; + num_nearest = 0; + for (current_provider = provider_list, dist = dists; NULL != current_provider; + current_provider = current_provider->next) { + if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider) + && min_dist == *(dist++) && provider_rank == num_nearest++) { + *provider = current_provider; + ret = OPAL_SUCCESS; + goto out; + } + } out: - *valin = val; - return nearest; -} + if (val) + PMIx_Value_free(val, 1); -#if OPAL_OFI_PCI_DATA_AVAILABLE -/** - * Determine if a device is nearest - * - * Given a device distances array of the nearest pci devices, - * determine if one of these device distances refers to the pci - * device passed in - * - * @param distances (IN) distances array - * @param num_distances (IN) number of entries in the distances array - * @param topology (IN) topology of the node - * @param pci (IN) PCI device being examined - * - * @return true if the PCI device is in the distances array or if the - * distances array is not provided. False otherwise. 
- * - */ -#if HWLOC_API_VERSION < 0x00020000 -static bool is_near(pmix_device_distance_t *distances, - int num_distances, - hwloc_topology_t topology, - struct fi_pci_attr pci) -{ - return true; + return ret; } -#else -static bool is_near(pmix_device_distance_t *distances, - int num_distances, - hwloc_topology_t topology, - struct fi_pci_attr pci) -{ - hwloc_obj_t pcidev, osdev; - - /* if we failed to find any distances, then we consider all interfaces - * to be of equal distances and let the caller decide how to handle - * them - */ - if (!distances) - return true; - - pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id, - pci.bus_id, pci.device_id, - pci.function_id); - if (!pcidev) - return false; - - for(osdev = pcidev->io_first_child; osdev != NULL; osdev = osdev->next_sibling) { - int i; - if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) { - const char *nguid = hwloc_obj_get_info_by_name(osdev,"NodeGUID"); - const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID"); - - if (!nguid && !sguid) - continue; - - for (i = 0; i < num_distances; i++) { - char lsguid[20], lnguid[20]; - int ret; - - if (!distances[i].osname || !osdev->name - || strcmp(distances[i].osname, osdev->name)) - continue; +static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank, + size_t num_providers) +{ + uint32_t provider_rank = rank % num_providers; + struct fi_info *current_provider = provider_list; - ret = sscanf(distances[i].uuid, "fab://%19s::%19s", lnguid, lsguid); - if (ret != 2) - continue; - if (nguid && (0 == strcasecmp(lnguid, nguid))) { - return true; - } else if (sguid && (0 == strcasecmp(lsguid, sguid))) { - return true; - } - } - } else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) { - const char *address = hwloc_obj_get_info_by_name(osdev, "Address"); - if (!address) - continue; - for (i = 0; i < num_distances; i++) { - char *addr = strstr(distances[i].uuid, "://"); - if (!addr || addr + 3 > 
distances[i].uuid - + strlen(distances[i].uuid)) - continue; - if (!strcmp(addr+3, address)) { - return true; - } - } - } + for (uint32_t i = 0; i < provider_rank; ++i) { + current_provider = current_provider->next; } - return false; + return current_provider; } -#endif -#endif // OPAL_OFI_PCI_DATA_AVAILABLE static int count_providers(struct fi_info *provider_list) { @@ -829,108 +884,41 @@ static uint32_t get_package_rank(opal_process_info_t *process_info) } struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, - opal_process_info_t *process_info) + opal_process_info_t *process_info) { - struct fi_info *provider = provider_list, *current_provider = provider_list; - struct fi_info **provider_table; -#if OPAL_OFI_PCI_DATA_AVAILABLE - pmix_device_distance_t *distances = NULL; - pmix_value_t *pmix_val; - struct fi_pci_attr pci; - int num_distances = 0; -#endif - bool near = false; - int ret; - unsigned int num_provider = 0, provider_limit = 0; - bool provider_found = false; + int ret, num_providers = 0; + struct fi_info *provider = NULL; + uint32_t package_rank = 0; + + num_providers = count_providers(provider_list); + if (!process_info->proc_is_bound || 2 > num_providers) { + goto round_robin; + } /* Initialize opal_hwloc_topology if it is not already */ ret = opal_hwloc_base_get_topology(); if (0 > ret) { /* Provider selection can continue but there is no guarantee of locality */ - opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Failed to initialize topology\n", + opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Failed to initialize topology", __FILE__, __LINE__); } - provider_limit = count_providers(provider_list); - - /* Allocate memory for provider table */ - provider_table = calloc(provider_limit, sizeof(struct fi_info *)); - if (NULL == provider_table) { - opal_output_verbose(1, opal_common_ofi.output, - "%s:%d:Failed to allocate memory for provider table\n", __FILE__, - __LINE__); - return provider_list; - } + package_rank = 
get_package_rank(process_info); #if OPAL_OFI_PCI_DATA_AVAILABLE - /* find all the nearest devices to this thread, then out of these - * determine which device we should bind to. - */ - distances = get_nearest_nics(&num_distances, &pmix_val); -#endif - - current_provider = provider; - - /* Cycle through remaining fi_info objects, looking for alike providers */ - while (NULL != current_provider) { - if (!check_provider_attr(provider, current_provider)) { - near = false; -#if OPAL_OFI_PCI_DATA_AVAILABLE - if (NULL != current_provider->nic - && NULL != current_provider->nic->bus_attr - && current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) { - pci = current_provider->nic->bus_attr->attr.pci; - near = is_near(distances, num_distances, - opal_hwloc_topology, pci); - } -#endif - /* We could have multiple near providers */ - if (near && !provider_found) { - provider_found = true; - num_provider = 0; - } - - /* Add the provider to the provider list if the cpusets match or if - * no other provider was found on the same cpuset as the process. 
- */ - if (near || !provider_found) { - provider_table[num_provider] = current_provider; - num_provider++; - } - } - current_provider = current_provider->next; - } - - /* Select provider from local rank % number of providers */ - uint32_t package_rank = get_package_rank(process_info); - if (num_provider >= 2) { - // If there are multiple NICs "close" to the process, try to calculate package_rank - provider = provider_table[package_rank % num_provider]; - } else if (num_provider == 1) { - provider = provider_table[num_provider - 1]; - } - -#if OPAL_OFI_PCI_DATA_AVAILABLE - if (NULL != provider->nic - && NULL != provider->nic->bus_attr - && provider->nic->bus_attr->bus_type == FI_BUS_PCI) { - pci = provider->nic->bus_attr->attr.pci; - near = is_near(distances, num_distances, - opal_hwloc_topology, pci); + ret = get_nearest_nic(opal_hwloc_topology, provider_list, num_providers, package_rank, + &provider); + if (OPAL_SUCCESS == ret) { + goto out; } -#endif +#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */ +round_robin: + provider = select_provider_round_robin(provider_list, package_rank, num_providers); +out: #if OPAL_ENABLE_DEBUG - opal_output_verbose(1, opal_common_ofi.output, - "package rank: %d device: %s near: %s\n", package_rank, - provider->domain_attr->name, near ? "true" : "false"); -#endif - - free(provider_table); -#if OPAL_OFI_PCI_DATA_AVAILABLE - if (pmix_val) - PMIx_Value_free(pmix_val, 1); + opal_output_verbose(1, opal_common_ofi.output, "package rank: %d device: %s", package_rank, + provider->domain_attr->name); #endif return provider; } From f9800fdecfd60539e0687009ecc09ff729e3d642 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Fri, 1 Dec 2023 16:43:23 -0800 Subject: [PATCH 2/3] opal/ofi: fix round-robin selection logic This change fixes current round-robin selection logic: - Only providers of the same type should be considered, i.e. providers that match the head of the list. This deviates from the documented behavior. 
- For unbound process the selection should be based on its local rank, i.e. rank among processes on the same node. Currently only the first NIC will be selected. Signed-off-by: Wenduo Wang (cherry picked from commit b061f96156b56d7990756e3e2759708fcf065dad) --- opal/mca/common/ofi/common_ofi.c | 51 ++++++++++++++++++++----- opal/mca/common/ofi/help-common-ofi.txt | 6 +++ 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index c7a9fabd3e9..3b25fee02c6 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -771,16 +771,46 @@ static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_l return ret; } -static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank, - size_t num_providers) +/** + * @brief Selects a provider from the list in a round-robin fashion + * + * This function implements a round-robin algorithm to select a provider from + * the provided list based on a rank. Only providers of the same type as the + * first provider are eligible for selection. + * + * @param[in] provider_list A list of providers to select from. + * @param[in] rank A rank metric for the current process, such as + * the rank on the same node or CPU package. 
+ * @return Pointer to the selected provider + */ +static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank) { - uint32_t provider_rank = rank % num_providers; - struct fi_info *current_provider = provider_list; + uint32_t provider_rank = 0, current_rank = 0; + size_t num_providers = 0; + struct fi_info *current_provider = NULL; - for (uint32_t i = 0; i < provider_rank; ++i) { + for (current_provider = provider_list; NULL != current_provider;) { + if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)) { + ++num_providers; + } current_provider = current_provider->next; } + current_provider = provider_list; + if (2 > num_providers) { + goto out; + } + + provider_rank = rank % num_providers; + + while (NULL != current_provider) { + if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider) + && provider_rank == current_rank++) { + break; + } + current_provider = current_provider->next; + } +out: return current_provider; } @@ -888,7 +918,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, { int ret, num_providers = 0; struct fi_info *provider = NULL; - uint32_t package_rank = 0; + uint32_t package_rank = process_info->my_local_rank; num_providers = count_providers(provider_list); if (!process_info->proc_is_bound || 2 > num_providers) { @@ -914,7 +944,12 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, #endif /* OPAL_OFI_PCI_DATA_AVAILABLE */ round_robin: - provider = select_provider_round_robin(provider_list, package_rank, num_providers); + if (!process_info->proc_is_bound && 1 < num_providers + && opal_output_get_verbosity(opal_common_ofi.output) >= 1) { + opal_show_help("help-common-ofi.txt", "unbound_process", true, 1); + } + + provider = select_provider_round_robin(provider_list, package_rank); out: #if OPAL_ENABLE_DEBUG opal_output_verbose(1, opal_common_ofi.output, "package rank: %d device: %s", package_rank, @@ -988,5 +1023,3 
@@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add } return ret; } - - diff --git a/opal/mca/common/ofi/help-common-ofi.txt b/opal/mca/common/ofi/help-common-ofi.txt index 44366a64c5f..de3630f7e7a 100644 --- a/opal/mca/common/ofi/help-common-ofi.txt +++ b/opal/mca/common/ofi/help-common-ofi.txt @@ -7,6 +7,12 @@ # # $HEADER$ # +[unbound_process] +Open MPI's OFI driver detected multiple NICs on the system but cannot select an +optimal device because the current process is not bound. This may negatively +impact performance. This can be resolved by specifying "--bind-to ..." on +command line. + [package_rank failed] Open MPI's OFI driver detected multiple equidistant NICs from the current process, but had insufficient information to ensure MPI processes fairly pick a NIC for use. From 1b1dd859c69f952d7ef71925bc4730e043ae0b5f Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Fri, 1 Dec 2023 16:38:47 -0800 Subject: [PATCH 3/3] opal/ofi: update nic selection function doc The documentation needs an update to reflect latest implementation. The original cpuset matching logic has been replaced with a new distance calculation algorithm. This change also clarifies the round-robin selection process when we need to break a tie. 
Signed-off-by: Wenduo Wang (cherry picked from commit 3aba0bb5c5f3a7cd8713ad9dd49bfd4f1ce58206) --- opal/mca/common/ofi/common_ofi.c | 12 +++-- opal/mca/common/ofi/common_ofi.h | 84 ++++++++++++++++---------------- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 3b25fee02c6..985e2fd51e4 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -670,10 +670,10 @@ static int get_provider_distance(struct fi_info *provider, hwloc_topology_t topo /** * @brief Get the nearest device to the current thread * - * Use the PMIx server or calculate the device distances, then out of the set of - * returned distances find the subset of the nearest devices. This can be - * 0 or more. - * If there are multiple equidistant devices, break the tie using the rank. + * Compute the distances from the current thread to each NIC in provider_list, + * and select the NIC with the shortest distance. + * If there are multiple equidistant devices, break the tie using local rank + * to balance NIC utilization. * * @param[in] topoloy hwloc topology * @param[in] provider_list List of providers to select from @@ -936,6 +936,10 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, package_rank = get_package_rank(process_info); #if OPAL_OFI_PCI_DATA_AVAILABLE + /** + * If provider PCI BDF information is available, we calculate its physical distance + * to the current process, and select the provider with the shortest distance. 
+ */ ret = get_nearest_nic(opal_hwloc_topology, provider_list, num_providers, package_rank, &provider); if (OPAL_SUCCESS == ret) { diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index 0bf114f5907..7118f6a3a01 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -135,47 +135,47 @@ OPAL_DECLSPEC int opal_common_ofi_providers_subset_of_list(struct fi_info *provi /** * Selects NIC (provider) based on hardware locality * - * In multi-nic situations, use hardware topology to pick the "best" - * of the selected NICs. - * There are 3 main cases that this covers: - * - * 1. If the first provider passed into this function is the only valid - * provider, this provider is returned. - * - * 2. If there is more than 1 provider that matches the type of the first - * provider in the list, and the BDF data - * is available then a provider is selected based on locality of device - * cpuset and process cpuset and tries to ensure that processes - * are distributed evenly across NICs. This has two separate - * cases: - * - * i. There is one or more provider local to the process: - * - * (local rank % number of providers of the same type - * that share the process cpuset) is used to select one - * of these providers. - * - * ii. There is no provider that is local to the process: - * - * (local rank % number of providers of the same type) - * is used to select one of these providers - * - * 3. If there is more than 1 providers of the same type in the - * list, and the BDF data is not available (the ofi version does - * not support fi_info.nic or the provider does not support BDF) - * then (local rank % number of providers of the same type) is - * used to select one of these providers - * - * @param provider_list (IN) struct fi_info* An initially selected - * provider NIC. The provider name and - * attributes are used to restrict NIC - * selection. This provider is returned if the - * NIC selection fails. 
- * - * @param provider (OUT) struct fi_info* object with the selected - * provider if the selection succeeds - * if the selection fails, returns the fi_info - * object that was initially provided. + * The selection is based on the following priority: + * + * Single-NIC: + * + * If only 1 provider is available, always return that provider. + * + * Multi-NIC: + * + * 1. If the process is NOT bound, pick a NIC using (local rank % number + * of providers of the same type). This gives a fair chance to each + * qualified NIC and balances overall utilization. + * + * 2. If the process is bound, we compare providers in the list that have + * the same type as the first provider, and find the provider with the + * shortest distance to the current process. + * + * i. If the provider has PCI BDF data, we attempt to compute the + * distance between the NIC and the current process cpuset. The NIC + * with the shortest distance is returned. + * + * * For equidistant NICs, we select a NIC in round-robin fashion + * using the package rank of the current process, i.e. (package + * rank % number of providers with the same distance). + * + * ii. If we cannot compute the distance between the NIC and the + * current process, e.g. PCI BDF data is not available, a NIC will be + * selected in a round-robin fashion using package rank, i.e. (package + * rank % number of providers of the same type). + * + * @param[in] provider_list struct fi_info* An initially selected + * provider NIC. The provider name and + * attributes are used to restrict NIC + * selection. This provider is returned if the + * NIC selection fails. + * + * @param[in] process_info opal_process_info_t* The current process info + * + * @return struct fi_info* object with the selected + * provider if the selection succeeds; + * if the selection fails, returns the fi_info + * object that was initially provided. * * All errors should be recoverable and will return the initially provided * provider. 
However, if an error occurs we can no longer guarantee @@ -184,7 +184,7 @@ OPAL_DECLSPEC int opal_common_ofi_providers_subset_of_list(struct fi_info *provi * */ OPAL_DECLSPEC struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, - opal_process_info_t *process_info); + opal_process_info_t *process_info); /** * Obtain EP endpoint name