From 48551c00c26ae4249644761ca51bbed64166ea8f Mon Sep 17 00:00:00 2001 From: Alexander Indenbaum Date: Mon, 14 Oct 2024 10:23:08 +0000 Subject: [PATCH 1/2] spdk: do not use huge pages Signed-off-by: Alexander Indenbaum --- control/grpc.py | 38 ----------------------------------- control/server.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 38 deletions(-) diff --git a/control/grpc.py b/control/grpc.py index e0cebcfe..30ad6c8d 100644 --- a/control/grpc.py +++ b/control/grpc.py @@ -302,44 +302,6 @@ def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, rp config.display_environment_info(self.logger) self.ceph_utils = ceph_utils self.ceph_utils.fetch_and_display_ceph_version() - requested_hugepages_val = os.getenv("HUGEPAGES", "") - if not requested_hugepages_val: - self.logger.warning("Can't get requested huge pages count") - else: - requested_hugepages_val = requested_hugepages_val.strip() - try: - requested_hugepages_val = int(requested_hugepages_val) - self.logger.info(f"Requested huge pages count is {requested_hugepages_val}") - except ValueError: - self.logger.warning(f"Requested huge pages count value {requested_hugepages_val} is not numeric") - requested_hugepages_val = None - hugepages_file = os.getenv("HUGEPAGES_DIR", "") - if not hugepages_file: - hugepages_file = "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages" - self.logger.warning("No huge pages file defined, will use /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages") - else: - hugepages_file = hugepages_file.strip() - if os.access(hugepages_file, os.F_OK): - try: - hugepages_val = "" - with open(hugepages_file) as f: - hugepages_val = f.readline() - hugepages_val = hugepages_val.strip() - if hugepages_val: - try: - hugepages_val = int(hugepages_val) - self.logger.info(f"Actual huge pages count is {hugepages_val}") - except ValueError: - self.logger.warning(f"Actual huge pages count value {hugepages_val} is not numeric") - 
hugepages_val = "" - if requested_hugepages_val and hugepages_val != "" and requested_hugepages_val > hugepages_val: - self.logger.warning(f"The actual huge page count {hugepages_val} is smaller than the requested value of {requested_hugepages_val}") - else: - self.logger.warning(f"Can't read actual huge pages count value from {hugepages_file}") - except Exception as ex: - self.logger.exception(f"Can't read actual huge pages count value from {hugepages_file}") - else: - self.logger.warning(f"Can't find huge pages file {hugepages_file}") self.config = config config.dump_config_file(self.logger) self.rpc_lock = rpc_lock diff --git a/control/server.py b/control/server.py index 16d05b9c..51bfa69c 100644 --- a/control/server.py +++ b/control/server.py @@ -481,6 +481,16 @@ def _start_spdk(self, omap_state): if spdk_tgt_cmd_extra_args: cmd += shlex.split(spdk_tgt_cmd_extra_args) + # No huge pages configuration controlled by spdk.mem_size conf option + spdk_memsize = self.config.getint_with_default("spdk", "mem_size", None) + if spdk_memsize: + self.logger.info(f"SPDK will not use huge pages, mem size: {spdk_memsize}") + cmd += ["--no-huge", "-s", str(spdk_memsize)] + else: + self.logger.info(f"SPDK will use huge pages, probing...") + self.probe_huge_pages() + + # If not provided in configuration, # calculate cpu mask available for spdk reactors if not cpumask_set(cmd): @@ -707,6 +717,47 @@ def _ping(self): self.logger.exception(f"spdk_get_version failed") return False + def probe_huge_pages(self): + """Probe kernel's huge pages configuration""" + requested_hugepages_val = os.getenv("HUGEPAGES", "") + if not requested_hugepages_val: + self.logger.warning("Can't get requested huge pages count") + else: + requested_hugepages_val = requested_hugepages_val.strip() + try: + requested_hugepages_val = int(requested_hugepages_val) + self.logger.info(f"Requested huge pages count is {requested_hugepages_val}") + except ValueError: + self.logger.warning(f"Requested huge pages count 
value {requested_hugepages_val} is not numeric") + requested_hugepages_val = None + hugepages_file = os.getenv("HUGEPAGES_DIR", "") + if not hugepages_file: + hugepages_file = "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages" + self.logger.warning("No huge pages file defined, will use /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages") + else: + hugepages_file = hugepages_file.strip() + if os.access(hugepages_file, os.F_OK): + try: + hugepages_val = "" + with open(hugepages_file) as f: + hugepages_val = f.readline() + hugepages_val = hugepages_val.strip() + if hugepages_val: + try: + hugepages_val = int(hugepages_val) + self.logger.info(f"Actual huge pages count is {hugepages_val}") + except ValueError: + self.logger.warning(f"Actual huge pages count value {hugepages_val} is not numeric") + hugepages_val = "" + if requested_hugepages_val and hugepages_val != "" and requested_hugepages_val > hugepages_val: + self.logger.warning(f"The actual huge page count {hugepages_val} is smaller than the requested value of {requested_hugepages_val}") + else: + self.logger.warning(f"Can't read actual huge pages count value from {hugepages_file}") + except Exception as ex: + self.logger.exception(f"Can't read actual huge pages count value from {hugepages_file}") + else: + self.logger.warning(f"Can't find huge pages file {hugepages_file}") + def gateway_rpc_caller(self, requests, is_add_req): """Passes RPC requests to gateway service.""" for key, val in requests.items(): From aa6b0df39da6b422d5c214670166b02fcb76200c Mon Sep 17 00:00:00 2001 From: Alexander Indenbaum Date: Mon, 18 Nov 2024 09:31:26 +0000 Subject: [PATCH 2/2] Add no_huge ha test Signed-off-by: Alexander Indenbaum --- .github/workflows/build-container.yml | 2 +- tests/ceph-nvmeof.no-huge.conf | 87 +++++++++++++++++++++++++++ tests/ha/no_huge.sh | 1 + tests/ha/start_up_no_huge.sh | 13 ++++ 4 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 tests/ceph-nvmeof.no-huge.conf create mode 120000 
tests/ha/no_huge.sh create mode 100755 tests/ha/start_up_no_huge.sh diff --git a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml index 3b993649..f938320c 100644 --- a/.github/workflows/build-container.yml +++ b/.github/workflows/build-container.yml @@ -538,7 +538,7 @@ jobs: strategy: fail-fast: false matrix: - test: ["sanity", "ns_lb_change", "no_subsystems", "state_transitions", "state_transitions_both_gws", "state_transitions_loop", "state_transitions_rand_loop", "late_registration", "late_registration_loop", "4gws", "4gws_loop", "4gws_create_delete", "4gws_create_delete_loop", "namespaces", "namespaces_loop", "mtls", "notify", "ceph_status", "blocklist", "main_exit"] + test: ["sanity", "no_huge", "ns_lb_change", "no_subsystems", "state_transitions", "state_transitions_both_gws", "state_transitions_loop", "state_transitions_rand_loop", "late_registration", "late_registration_loop", "4gws", "4gws_loop", "4gws_create_delete", "4gws_create_delete_loop", "namespaces", "namespaces_loop", "mtls", "notify", "ceph_status", "blocklist", "main_exit"] runs-on: ubuntu-latest env: HUGEPAGES: 1024 # 4 spdk instances diff --git a/tests/ceph-nvmeof.no-huge.conf b/tests/ceph-nvmeof.no-huge.conf new file mode 100644 index 00000000..d5253720 --- /dev/null +++ b/tests/ceph-nvmeof.no-huge.conf @@ -0,0 +1,87 @@ +# +# Copyright (c) 2021 International Business Machines +# All rights reserved. 
+# +# SPDX-License-Identifier: LGPL-3.0-or-later +# +# Authors: anita.shekar@ibm.com, sandy.kaur@ibm.com +# + +[gateway] +name = +group = +addr = 0.0.0.0 +port = 5500 +enable_auth = False +state_update_notify = True +state_update_timeout_in_msec = 2000 +state_update_interval_sec = 5 +enable_spdk_discovery_controller = False +#omap_file_lock_duration = 20 +#omap_file_lock_retries = 30 +#omap_file_lock_retry_sleep_interval = 1.0 +#omap_file_update_reloads = 10 +#enable_prometheus_exporter = True +#prometheus_exporter_ssl = True +#prometheus_port = 10008 +#prometheus_bdev_pools = rbd +#prometheus_stats_interval = 10 +#verify_nqns = True +#allowed_consecutive_spdk_ping_failures = 1 +#spdk_ping_interval_in_seconds = 2.0 +#max_hosts_per_namespace = 1 +#max_namespaces_with_netmask = 1000 +#max_subsystems = 128 +#max_namespaces = 256 +#max_hosts_per_subsystem = 32 + +[gateway-logs] +log_level=debug +#log_files_enabled = True +#log_files_rotation_enabled = True +#verbose_log_messages = True +#max_log_file_size_in_mb=10 +#max_log_files_count=20 +#max_log_directory_backups=10 +# +# Notice that if you change the log directory the log files will only be visible inside the container +# +#log_directory = /var/log/ceph/ + +[discovery] +addr = 0.0.0.0 +port = 8009 + +[ceph] +pool = rbd +config_file = /etc/ceph/ceph.conf + +[mtls] +server_key = ./server.key +client_key = ./client.key +server_cert = ./server.crt +client_cert = ./client.crt + +[spdk] +bdevs_per_cluster = 32 +mem_size=4096 +tgt_path = /usr/local/bin/nvmf_tgt +#rpc_socket_dir = /var/tmp/ +#rpc_socket_name = spdk.sock +#tgt_cmd_extra_args = --env-context="--no-huge -m1024" --iova-mode=va +timeout = 60.0 +#log_level = +#protocol_log_level = WARNING +#log_file_dir = + +# Example value: -m 0x3 -L all +# tgt_cmd_extra_args = + +# transports = tcp + +# Example value: {"max_queue_depth" : 16, "max_io_size" : 4194304, "io_unit_size" : 1048576, "zcopy" : false} +transport_tcp_options = {"in_capsule_data_size" : 8192, 
"max_io_qpairs_per_ctrlr" : 7} + +[monitor] +#timeout = 1.0 +#log_file_dir = diff --git a/tests/ha/no_huge.sh b/tests/ha/no_huge.sh new file mode 120000 index 00000000..d4dc3848 --- /dev/null +++ b/tests/ha/no_huge.sh @@ -0,0 +1 @@ +sanity.sh \ No newline at end of file diff --git a/tests/ha/start_up_no_huge.sh b/tests/ha/start_up_no_huge.sh new file mode 100755 index 00000000..da44e47b --- /dev/null +++ b/tests/ha/start_up_no_huge.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +set -ex + +# Check if GITHUB_WORKSPACE is defined +if [ -n "$GITHUB_WORKSPACE" ]; then + test_dir="$GITHUB_WORKSPACE/tests/ha" +else + test_dir=$(dirname $0) +fi + +export NVMEOF_CONFIG=./tests/ceph-nvmeof.no-huge.conf +$test_dir/start_up.sh