From 3aba0bb5c5f3a7cd8713ad9dd49bfd4f1ce58206 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Fri, 1 Dec 2023 16:38:47 -0800 Subject: [PATCH] opal/ofi: update nic selection function doc The documentation needs an update to reflect latest implementation. The original cpuset matching logic has been replaced with a new distance calculation algorithm. This change also clarifies the round-robin selection process when we need to break a tie. Signed-off-by: Wenduo Wang --- opal/mca/common/ofi/common_ofi.c | 12 +++-- opal/mca/common/ofi/common_ofi.h | 84 ++++++++++++++++---------------- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 098e54503e1..c922ea218d4 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -623,10 +623,10 @@ static int get_provider_distance(struct fi_info *provider, hwloc_topology_t topo /** * @brief Get the nearest device to the current thread * - * Use the PMIx server or calculate the device distances, then out of the set of - * returned distances find the subset of the nearest devices. This can be - * 0 or more. - * If there are multiple equidistant devices, break the tie using the rank. + * Compute the distances from the current thread to each NIC in provider_list, + * and select the NIC with the shortest distance. + * If there are multiple equidistant devices, break the tie using local rank + * to balance NIC utilization. * * @param[in] topoloy hwloc topology * @param[in] provider_list List of providers to select from @@ -898,6 +898,10 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, package_rank = get_package_rank(process_info); #if OPAL_OFI_PCI_DATA_AVAILABLE + /** + * If provider PCI BDF information is available, we calculate its physical distance + * to the current process, and select the provider with the shortest distance. + */ ret = get_nearest_nic(opal_hwloc_topology, provider_list, num_providers, package_rank, &provider); if (OPAL_SUCCESS == ret) { diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index 10992410019..edc5be8a1fe 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -103,47 +103,47 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item); /** * Selects NIC (provider) based on hardware locality * - * In multi-nic situations, use hardware topology to pick the "best" - * of the selected NICs. - * There are 3 main cases that this covers: - * - * 1. If the first provider passed into this function is the only valid - * provider, this provider is returned. - * - * 2. If there is more than 1 provider that matches the type of the first - * provider in the list, and the BDF data - * is available then a provider is selected based on locality of device - * cpuset and process cpuset and tries to ensure that processes - * are distributed evenly across NICs. This has two separate - * cases: - * - * i. There is one or more provider local to the process: - * - * (local rank % number of providers of the same type - * that share the process cpuset) is used to select one - * of these providers. - * - * ii. There is no provider that is local to the process: - * - * (local rank % number of providers of the same type) - * is used to select one of these providers - * - * 3. If there is more than 1 providers of the same type in the - * list, and the BDF data is not available (the ofi version does - * not support fi_info.nic or the provider does not support BDF) - * then (local rank % number of providers of the same type) is - * used to select one of these providers - * - * @param provider_list (IN) struct fi_info* An initially selected - * provider NIC. The provider name and - * attributes are used to restrict NIC - * selection. This provider is returned if the - * NIC selection fails. - * - * @param provider (OUT) struct fi_info* object with the selected - * provider if the selection succeeds - * if the selection fails, returns the fi_info - * object that was initially provided. + * The selection is based on the following priority: + * + * Single-NIC: + * + * If only 1 provider is available, always return that provider. + * + * Multi-NIC: + * + * 1. If the process is NOT bound, pick a NIC using (local rank % number + * of providers of the same type). This gives a fair chance to each + * qualified NIC and balances overall utilization. + * + * 2. If the process is bound, we compare providers in the list that have + * the same type as the first provider, and find the provider with the + * shortest distance to the current process. + * + * i. If the provider has PCI BDF data, we attempt to compute the + * distance between the NIC and the current process cpuset. The NIC + * with the shortest distance is returned. + * + * * For equidistant NICs, we select a NIC in round-robin fashion + * using the package rank of the current process, i.e. (package + * rank % number of providers with the same distance). + * + * ii. If we cannot compute the distance between the NIC and the + * current process, e.g. PCI BDF data is not available, a NIC will be + * selected in a round-robin fashion using package rank, i.e. (package + * rank % number of providers of the same type). + * + * @param[in] provider_list struct fi_info* An initially selected + * provider NIC. The provider name and + * attributes are used to restrict NIC + * selection. This provider is returned if the + * NIC selection fails. + * + * @param[in] process_info opal_process_info_t* The current process info + * + * @param[out] provider struct fi_info* object with the selected + * provider if the selection succeeds + * if the selection fails, returns the fi_info + * object that was initially provided. * * All errors should be recoverable and will return the initially provided * provider. However, if an error occurs we can no longer guarantee @@ -152,7 +152,7 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item); * */ OPAL_DECLSPEC struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, - opal_process_info_t *process_info); + opal_process_info_t *process_info); /** * Obtain EP endpoint name