diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 1957bc80a67..f0da0f4a52c 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -440,12 +440,24 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr && !check_ep_attr(provider_info->ep_attr, provider->ep_attr) && !(provider_info->caps & ~(provider->caps)) && !(provider_info->mode & ~(provider->mode)) && provider_info->addr_format == provider->addr_format) { - return 0; + return OPAL_SUCCESS; } else { return OPAL_ERROR; } } +#if OPAL_OFI_PCI_DATA_AVAILABLE +static int get_provider_nic_pci(struct fi_info *provider, struct fi_pci_attr *pci) +{ + if (NULL != provider->nic && NULL != provider->nic->bus_attr + && FI_BUS_PCI == provider->nic->bus_attr->bus_type) { + *pci = provider->nic->bus_attr->attr.pci; + return OPAL_SUCCESS; + } + return OPAL_ERR_NOT_AVAILABLE; +} +#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */ + /** * Calculate device distances * @@ -510,177 +522,220 @@ static int compute_dev_distances(pmix_device_distance_t **distances, } /** - * Find the nearest devices to the current thread + * @brief Get the provider distance from the provided distance metrics + * + * @param[in] topology hwloc topology + * @param[in] provider Provider object + * @param[in] distances List of known device distances + * @param[in] num_distances Length of distances + * @param[out] distance Pointer to store the provider distance + * @return OPAL_SUCCESS if and only if the distance is found in the provided list + */ +#if OPAL_OFI_PCI_DATA_AVAILABLE +static int get_provider_distance(hwloc_topology_t topology, struct fi_info *provider, + pmix_device_distance_t *distances, int num_distances, + uint16_t *distance) +{ + hwloc_obj_t pcidev, osdev; + struct fi_pci_attr pci = {0}; + + if (OPAL_SUCCESS != get_provider_nic_pci(provider, &pci)) { + opal_output_verbose(1, opal_common_ofi.output, "Cannot determine PCI attributes of provider %s", + provider->domain_attr->name); + return OPAL_ERROR; + } + + pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id, + pci.function_id); + if (!pcidev) { + opal_output_verbose(1, opal_common_ofi.output, "Cannot locate PCI device of provider %s", + provider->domain_attr->name); + return OPAL_ERROR; + } + +#if HWLOC_API_VERSION < 0x00020000 + osdev = pcidev->first_child; +#else + osdev = pcidev->io_first_child; +#endif /* HWLOC_API_VERSION */ + for (; osdev != NULL; osdev = osdev->next_sibling) { + int i; + + if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) { + const char *nguid = hwloc_obj_get_info_by_name(osdev, "NodeGUID"); + const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID"); + + if (!nguid && !sguid) + continue; + + for (i = 0; i < num_distances; i++) { + char lsguid[20], lnguid[20]; + int ret; + + if (PMIX_DEVTYPE_OPENFABRICS != distances[i].type) { + continue; + } + + if (!distances[i].osname || !osdev->name + || strcmp(distances[i].osname, osdev->name)) + continue; + + ret = sscanf(distances[i].uuid, "fab://%19s::%19s", lnguid, lsguid); + if (ret != 2) + continue; + + if ((nguid && (0 == strcasecmp(lnguid, nguid))) + || (sguid && (0 == strcasecmp(lsguid, sguid)))) { + *distance = distances[i].mindist; + return OPAL_SUCCESS; + } + } + } else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) { + const char *address = hwloc_obj_get_info_by_name(osdev, "Address"); + if (!address) + continue; + for (i = 0; i < num_distances; i++) { + if (PMIX_DEVTYPE_NETWORK != distances[i].type) { + continue; + } + char *addr = strstr(distances[i].uuid, "://"); + if (!addr || addr + 3 > distances[i].uuid + strlen(distances[i].uuid)) + continue; + if (!strcmp(addr + 3, address)) { + *distance = distances[i].mindist; + return OPAL_SUCCESS; + } + } + } + } + + return OPAL_ERROR; +} +#else +static int get_provider_distance(struct fi_info *provider, hwloc_topology_t topology, + pmix_device_distance_t *distances, size_t num_distances, + uint16_t *distance) +{ + return OPAL_ERROR; +} +#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */ + +/** + * @brief Get the nearest device to the current thread * * Use the PMIx server or calculate the device distances, then out of the set of * returned distances find the subset of the nearest devices. This can be - * 1 or more. - * - * @param num_distances (OUT) number of entries in the returned array + * 0 or more. + * If there are multiple equidistant devices, break the tie using the rank. * - * @return An array of device distances which are nearest this thread - * or NULL if we fail to get the distances. In this case we will just - * revert to round robin. + * @param[in] topoloy hwloc topology + * @param[in] provider_list List of providers to select from + * @param[in] num_providers Number of providers in provider_list + * @param[in] rank local rank of the process + * @param[out] provider pointer to the selected provider * + * @return OPAL_SUCCESS if and only if a nearest provider is found. */ -static pmix_device_distance_t * -get_nearest_nics(int *num_distances, pmix_value_t **valin) +static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_list, + size_t num_providers, uint32_t rank, struct fi_info **provider) { - size_t ndist, i; - int ret, idx = 0; + int ret; pmix_data_array_t *dptr; - uint16_t near = USHRT_MAX; + pmix_device_distance_t *distances; pmix_info_t directive; pmix_value_t *val = NULL; - pmix_device_distance_t *distances, *nearest = NULL; + size_t ndist, num_nearest = 0; + struct fi_info *current_provider = NULL; + uint16_t dists[num_providers], *dist = NULL, min_dist = USHRT_MAX; + uint32_t provider_rank = 0; PMIx_Info_load(&directive, PMIX_OPTIONAL, NULL, PMIX_BOOL); - ret = PMIx_Get(&opal_process_info.myprocid, - PMIX_DEVICE_DISTANCES, &directive, 1, &val); + ret = PMIx_Get(&opal_process_info.myprocid, PMIX_DEVICE_DISTANCES, &directive, 1, &val); PMIx_Info_destruct(&directive); if (ret != PMIX_SUCCESS || !val) { ret = compute_dev_distances(&distances, &ndist); if (ret) { + ret = OPAL_ERROR; goto out; } goto find_nearest; } if (PMIX_DATA_ARRAY != val->type) { + ret = OPAL_ERROR; goto out; } dptr = val->data.darray; if (NULL == dptr) { + ret = OPAL_ERROR; goto out; } if (PMIX_DEVICE_DIST != dptr->type) { + ret = OPAL_ERROR; goto out; } - distances = (pmix_device_distance_t*)dptr->array; + distances = (pmix_device_distance_t *) dptr->array; ndist = dptr->size; find_nearest: - nearest = calloc(sizeof(*distances), ndist); - if (!nearest) { - goto out; - } - - for (i = 0; i < ndist; i++) { - if (distances[i].type != PMIX_DEVTYPE_NETWORK && - distances[i].type != PMIX_DEVTYPE_OPENFABRICS) + for (current_provider = provider_list, dist = dists; NULL != current_provider; + current_provider = current_provider->next, ++dist) { + if (OPAL_SUCCESS != check_provider_attr(provider_list, current_provider)) { continue; - if (distances[i].mindist < near) { - idx = 0; - near = distances[i].mindist; - nearest[idx] = distances[i]; - idx++; - } else if (distances[i].mindist == near) { - nearest[idx] = distances[i]; - idx++; + } + if (OPAL_SUCCESS != get_provider_distance(topology, current_provider, distances, ndist, dist)) { + *dist = USHRT_MAX; + } + + if (*dist < min_dist) { + min_dist = *dist; + num_nearest = 1; + } else if (*dist == min_dist) { + ++num_nearest; + } + + if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)) { + opal_output_verbose(1, opal_common_ofi.output, "provider: %s dist: %d", + current_provider->domain_attr->name, *dist); } } - *num_distances = idx; + ret = OPAL_ERROR; + if (0 >= num_nearest) { + return ret; + } + provider_rank = rank % num_nearest; + num_nearest = 0; + for (current_provider = provider_list, dist = dists; NULL != current_provider; + current_provider = current_provider->next) { + if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider) + && min_dist == *(dist++) && provider_rank == num_nearest++) { + *provider = current_provider; + ret = OPAL_SUCCESS; + goto out; + } + } out: - *valin = val; - return nearest; -} + if (val) + PMIx_Value_free(val, 1); -#if OPAL_OFI_PCI_DATA_AVAILABLE -/** - * Determine if a device is nearest - * - * Given a device distances array of the nearest pci devices, - * determine if one of these device distances refers to the pci - * device passed in - * - * @param distances (IN) distances array - * @param num_distances (IN) number of entries in the distances array - * @param topology (IN) topology of the node - * @param pci (IN) PCI device being examined - * - * @return true if the PCI device is in the distances array or if the - * distances array is not provided. False otherwise. - * - */ -#if HWLOC_API_VERSION < 0x00020000 -static bool is_near(pmix_device_distance_t *distances, - int num_distances, - hwloc_topology_t topology, - struct fi_pci_attr pci) -{ - return true; + return ret; } -#else -static bool is_near(pmix_device_distance_t *distances, - int num_distances, - hwloc_topology_t topology, - struct fi_pci_attr pci) -{ - hwloc_obj_t pcidev, osdev; - - /* if we failed to find any distances, then we consider all interfaces - * to be of equal distances and let the caller decide how to handle - * them - */ - if (!distances) - return true; - - pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id, - pci.bus_id, pci.device_id, - pci.function_id); - if (!pcidev) - return false; - - for(osdev = pcidev->io_first_child; osdev != NULL; osdev = osdev->next_sibling) { - int i; - - if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) { - const char *nguid = hwloc_obj_get_info_by_name(osdev,"NodeGUID"); - const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID"); - - if (!nguid && !sguid) - continue; - - for (i = 0; i < num_distances; i++) { - char lsguid[20], lnguid[20]; - int ret; - if (!distances[i].osname || !osdev->name - || strcmp(distances[i].osname, osdev->name)) - continue; +static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank, + size_t num_providers) +{ + uint32_t provider_rank = rank % num_providers; + struct fi_info *current_provider = provider_list; - ret = sscanf(distances[i].uuid, "fab://%19s::%19s", lnguid, lsguid); - if (ret != 2) - continue; - if (nguid && (0 == strcasecmp(lnguid, nguid))) { - return true; - } else if (sguid && (0 == strcasecmp(lsguid, sguid))) { - return true; - } - } - } else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) { - const char *address = hwloc_obj_get_info_by_name(osdev, "Address"); - if (!address) - continue; - for (i = 0; i < num_distances; i++) { - char *addr = strstr(distances[i].uuid, "://"); - if (!addr || addr + 3 > distances[i].uuid - + strlen(distances[i].uuid)) - continue; - if (!strcmp(addr+3, address)) { - return true; - } - } - } + for (uint32_t i = 0; i < provider_rank; ++i) { + current_provider = current_provider->next; } - return false; + return current_provider; } -#endif -#endif // OPAL_OFI_PCI_DATA_AVAILABLE /* Count providers returns the number of providers present in an fi_info list * @param (IN) provider_list struct fi_info* list of providers available @@ -791,109 +846,41 @@ static uint32_t get_package_rank(opal_process_info_t *process_info) } struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, - opal_process_info_t *process_info) + opal_process_info_t *process_info) { - struct fi_info *provider = provider_list, *current_provider = provider_list; - struct fi_info **provider_table; -#if OPAL_OFI_PCI_DATA_AVAILABLE - pmix_device_distance_t *distances = NULL; - pmix_value_t *pmix_val; - struct fi_pci_attr pci; - int num_distances = 0; -#endif - bool near = false; - int ret; - unsigned int num_provider = 0, provider_limit = 0; - bool provider_found = false; + int ret, num_providers = 0; + struct fi_info *provider = NULL; uint32_t package_rank = 0; + num_providers = count_providers(provider_list); + if (!process_info->proc_is_bound || 2 > num_providers) { + goto round_robin; + } + /* Initialize opal_hwloc_topology if it is not already */ ret = opal_hwloc_base_get_topology(); if (0 > ret) { /* Provider selection can continue but there is no guarantee of locality */ - opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Failed to initialize topology\n", + opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Failed to initialize topology", __FILE__, __LINE__); } - provider_limit = count_providers(provider_list); - - /* Allocate memory for provider table */ - provider_table = calloc(provider_limit, sizeof(struct fi_info *)); - if (NULL == provider_table) { - opal_output_verbose(1, opal_common_ofi.output, - "%s:%d:Failed to allocate memory for provider table\n", __FILE__, - __LINE__); - return provider_list; - } + package_rank = get_package_rank(process_info); #if OPAL_OFI_PCI_DATA_AVAILABLE - /* find all the nearest devices to this thread, then out of these - * determine which device we should bind to. - */ - distances = get_nearest_nics(&num_distances, &pmix_val); -#endif - - current_provider = provider; - - /* Cycle through remaining fi_info objects, looking for alike providers */ - while (NULL != current_provider) { - if (!check_provider_attr(provider, current_provider)) { - near = false; -#if OPAL_OFI_PCI_DATA_AVAILABLE - if (NULL != current_provider->nic - && NULL != current_provider->nic->bus_attr - && current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) { - pci = current_provider->nic->bus_attr->attr.pci; - near = is_near(distances, num_distances, - opal_hwloc_topology, pci); - } -#endif - /* We could have multiple near providers */ - if (near && !provider_found) { - provider_found = true; - num_provider = 0; - } - - /* Add the provider to the provider list if the cpusets match or if - * no other provider was found on the same cpuset as the process. - */ - if (near || !provider_found) { - provider_table[num_provider] = current_provider; - num_provider++; - } - } - current_provider = current_provider->next; - } - - /* Select provider from local rank % number of providers */ - if (num_provider >= 2) { - // If there are multiple NICs "close" to the process, try to calculate package_rank - package_rank = get_package_rank(process_info); - provider = provider_table[package_rank % num_provider]; - } else if (num_provider == 1) { - provider = provider_table[num_provider - 1]; - } - -#if OPAL_OFI_PCI_DATA_AVAILABLE - if (NULL != provider->nic - && NULL != provider->nic->bus_attr - && provider->nic->bus_attr->bus_type == FI_BUS_PCI) { - pci = provider->nic->bus_attr->attr.pci; - near = is_near(distances, num_distances, - opal_hwloc_topology, pci); + ret = get_nearest_nic(opal_hwloc_topology, provider_list, num_providers, package_rank, + &provider); + if (OPAL_SUCCESS == ret) { + goto out; } -#endif +#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */ +round_robin: + provider = select_provider_round_robin(provider_list, package_rank, num_providers); +out: #if OPAL_ENABLE_DEBUG - opal_output_verbose(1, opal_common_ofi.output, - "package rank: %d device: %s near: %s\n", package_rank, - provider->domain_attr->name, near ? "true" : "false"); -#endif - - free(provider_table); -#if OPAL_OFI_PCI_DATA_AVAILABLE - if (pmix_val) - PMIx_Value_free(pmix_val, 1); + opal_output_verbose(1, opal_common_ofi.output, "package rank: %d device: %s", package_rank, + provider->domain_attr->name); #endif return provider; }