Skip to content

Commit

Permalink
Allow saving SPDK output to a file.
Browse files Browse the repository at this point in the history
Fixes ceph#906

Signed-off-by: Gil Bregman <[email protected]>
  • Loading branch information
gbregman committed Oct 21, 2024
1 parent 567ab1e commit d271cf3
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 13 deletions.
2 changes: 2 additions & 0 deletions ceph-nvmeof.conf
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ tgt_path = /usr/local/bin/nvmf_tgt
timeout = 60.0
#log_level =
#protocol_log_level = WARNING
#log_file_dir =

# Example value: -m 0x3 -L all
# tgt_cmd_extra_args =
Expand All @@ -79,3 +80,4 @@ transport_tcp_options = {"in_capsule_data_size" : 8192, "max_io_qpairs_per_ctrlr

[monitor]
#timeout = 1.0
#log_file_dir =
106 changes: 104 additions & 2 deletions control/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ def __init__(self, config: GatewayConfig):
self.gw_logger_object = GatewayLogger(self.config)
self.logger = self.gw_logger_object.logger
self.spdk_process = None
self.spdk_log_file = None
self.spdk_log_file_path = None
self.gateway_rpc = None
self.server = None
self.discovery_pid = None
Expand All @@ -102,6 +104,8 @@ def __init__(self, config: GatewayConfig):
self.rpc_lock = threading.Lock()
self.group_id = 0
self.monitor_client = '/usr/bin/ceph-nvmeof-monitor-client'
self.monitor_client_log_file = None
self.monitor_client_log_file_path = None
self.omap_state = None
self.omap_lock = None

Expand Down Expand Up @@ -130,6 +134,28 @@ def __exit__(self, exc_type, exc_value, traceback):
if self.spdk_process:
self._stop_spdk()

if self.spdk_log_file:
try:
close(self.spdk_log_file)
except Exception:
pass
self.spdk_log_file = None

if self.spdk_log_file_path:
GatewayLogger.compress_file(self.spdk_log_file_path, f"{self.spdk_log_file_path}.gz")
self.spdk_log_file_path = None

if self.monitor_client_log_file:
try:
close(self.monitor_client_log_file)
except Exception:
pass
self.monitor_client_log_file = None

if self.monitor_client_log_file_path:
GatewayLogger.compress_file(self.monitor_client_log_file_path, f"{self.monitor_client_log_file_path}.gz")
self.monitor_client_log_file_path = None

if self.server:
if logger:
logger.info("Stopping the server...")
Expand Down Expand Up @@ -272,11 +298,26 @@ def _start_monitor_client(self):
"--server-cert", self.config.get("mtls", "server_cert"),
"--client-key", self.config.get("mtls", "client_key"),
"--client-cert", self.config.get("mtls", "client_cert") ]

self.monitor_client_log_file = None
self.monitor_client_log_file_path = None
log_stderr = None
log_file_dir = self.config.get_with_default("monitor", "log_file_dir", None)
self.monitor_client_log_file_path = self.handle_process_output_file(log_file_dir, "monitor-client")
if self.monitor_client_log_file_path:
try:
self.monitor_client_log_file = open(self.monitor_client_log_file_path, "wt")
log_stderr = subprocess.STDOUT
except Exception:
pass

self.logger.info(f"Starting {' '.join(cmd)}")
try:
# start monitor client process
self.monitor_client_process = subprocess.Popen(cmd)
self.monitor_client_process = subprocess.Popen(cmd, stdout=self.monitor_client_log_file, stderr=log_stderr)
self.logger.info(f"monitor client process id: {self.monitor_client_process.pid}")
if self.monitor_client_log_file and self.monitor_client_log_file_path:
self.logger.info(f"Monitor log file is {self.monitor_client_log_file_path}")
# wait for monitor notification of the group id
self._wait_for_group_id()
except Exception:
Expand Down Expand Up @@ -365,6 +406,36 @@ def _grpc_server(self, address):

return server

def handle_process_output_file(self, log_file_dir, prefix):
if not log_file_dir or not log_file_dir.strip():
return None

try:
os.makedirs(log_file_dir, 0o755, True)
except Exception:
return None

if not log_file_dir.endswith("/"):
log_file_dir += "/"
log_file_path = f"{log_file_dir}{prefix}-{self.name}.log"

log_files = [f"{log_file_path}{suffix}" for suffix in ['.bak', '.gz.bak']]
for log_file in log_files:
try:
os.remove(log_file)
except Exception:
pass
try:
os.rename(log_file_path, f"{log_file_path}.bak")
except FileNotFoundError:
pass
try:
os.rename(f"{log_file_path}.gz", f"{log_file_path}.gz.bak")
except FileNotFoundError:
pass

return log_file_path

def _start_spdk(self, omap_state):
"""Starts SPDK process."""

Expand Down Expand Up @@ -403,11 +474,22 @@ def _start_spdk(self, omap_state):
self.logger.info(f"SPDK autodetecting cpu_mask: {cpu_mask}")
cmd += shlex.split(cpu_mask)

self.spdk_log_file = None
self.spdk_log_file_path = None
log_stderr = None
log_file_dir = self.config.get_with_default("spdk", "log_file_dir", None)
self.spdk_log_file_path = self.handle_process_output_file(log_file_dir, "spdk")
if self.spdk_log_file_path:
try:
self.spdk_log_file = open(self.spdk_log_file_path, "wt")
log_stderr = subprocess.STDOUT
except Exception:
pass
self.logger.info(f"Starting {' '.join(cmd)}")
try:
# start spdk process
time.sleep(2) # this is a temporary hack, we have a timing issue here. Once we solve it the sleep will ve removed
self.spdk_process = subprocess.Popen(cmd)
self.spdk_process = subprocess.Popen(cmd, stdout=self.spdk_log_file, stderr=log_stderr)
except Exception:
self.logger.exception(f"Unable to start SPDK")
raise
Expand All @@ -424,6 +506,8 @@ def _start_spdk(self, omap_state):
# connect timeout: spdk client retries 5 times per sec
conn_retries = int(timeout * 5)
self.logger.info(f"SPDK process id: {self.spdk_process.pid}")
if self.spdk_log_file and self.spdk_log_file_path:
self.logger.info(f"SPDK log file is {self.spdk_log_file_path}")
self.logger.info(
f"Attempting to initialize SPDK: rpc_socket: {self.spdk_rpc_socket_path},"
f" conn_retries: {conn_retries}, timeout: {timeout}, log level: {protocol_log_level}"
Expand Down Expand Up @@ -497,13 +581,31 @@ def _stop_monitor_client(self):
timeout = self.config.getfloat_with_default("monitor", "timeout", 1.0)
self._stop_subprocess(self.monitor_client_process, timeout)
self.monitor_client_process = None
if self.monitor_client_log_file:
try:
close(self.monitor_client_log_file)
except Exception:
pass
self.monitor_client_log_file = None
if self.monitor_client_log_file_path:
GatewayLogger.compress_file(self.monitor_client_log_file_path, f"{self.monitor_client_log_file_path}.gz")
self.monitor_client_log_file_path = None

def _stop_spdk(self):
"""Stops SPDK process."""
# Terminate spdk process
timeout = self.config.getfloat_with_default("spdk", "timeout", 60.0)
self._stop_subprocess(self.spdk_process, timeout)
self.spdk_process = None
if self.spdk_log_file:
try:
close(self.spdk_log_file)
except Exception:
pass
self.spdk_log_file = None
if self.spdk_log_file_path:
GatewayLogger.compress_file(self.spdk_log_file_path, f"{self.spdk_log_file_path}.gz")
self.spdk_log_file_path = None

# Clean spdk rpc socket
if self.spdk_rpc_socket_path and os.path.exists(self.spdk_rpc_socket_path):
Expand Down
2 changes: 1 addition & 1 deletion control/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def __init__(self, config=None):
max_log_file_size = 0
max_log_files_count = 0
try:
os.makedirs(self.log_directory, 0o777, True)
os.makedirs(self.log_directory, 0o755, True)
logdir_ok = True
self.handler = logging.handlers.RotatingFileHandler(self.log_directory + "/" + GatewayLogger.NVME_LOG_FILE_NAME,
maxBytes = max_log_file_size * 1024 * 1024,
Expand Down
6 changes: 5 additions & 1 deletion tests/ceph-nvmeof.tls.conf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ enable_spdk_discovery_controller = False
#verify_nqns = True
#allowed_consecutive_spdk_ping_failures = 1
#spdk_ping_interval_in_seconds = 2.0
#max_hosts_per_namespace = 1
#max_namespaces_with_netmask = 1000

[gateway-logs]
log_level=debug
Expand Down Expand Up @@ -63,7 +65,9 @@ tgt_path = /usr/local/bin/nvmf_tgt
#rpc_socket_name = spdk.sock
#tgt_cmd_extra_args = --env-context="--no-huge -m1024" --iova-mode=va
timeout = 60.0
#log_level = WARNING
#log_level =
#protocol_log_level = WARNING
#log_file_dir = /var/log/ceph

# Example value: -m 0x3 -L all
# tgt_cmd_extra_args =
Expand Down
33 changes: 24 additions & 9 deletions tests/test_log_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import shutil
import stat
import gzip
import glob

config = "ceph-nvmeof.conf"
subsystem_prefix = "nqn.2016-06.io.spdk:cnode"
Expand All @@ -36,13 +37,16 @@ def gateway(config, request):
config.config["gateway-logs"]["max_log_file_size_in_mb"] = "10"
config.config["gateway-logs"]["log_files_rotation_enabled"] = "True"
config.config["gateway"]["name"] = request.node.name.replace("_", "-")
config.config["spdk"]["log_file_dir"] = "/var/log/ceph"
if request.node.name == "test_log_files_disabled":
config.config["gateway-logs"]["log_files_enabled"] = "False"
elif request.node.name == "test_log_files_rotation":
config.config["gateway-logs"]["max_log_file_size_in_mb"] = "1"
elif request.node.name == "test_log_files_disable_rotation":
config.config["gateway-logs"]["max_log_file_size_in_mb"] = "1"
config.config["gateway-logs"]["log_files_rotation_enabled"] = "False"
elif request.node.name == "test_no_spdk_log":
config.config["spdk"]["log_file_dir"] = ""

with GatewayServer(config) as gateway:

Expand Down Expand Up @@ -73,6 +77,9 @@ def test_log_files(gateway):
assert "nvmeof-log.gz" in files or "nvmeof-log.0" in files
with open(f"/var/log/ceph/nvmeof-{gw.name}/nvmeof-log", "r") as f:
assert f"Starting gateway {gw.name}" in f.read()
spdk_files = glob.glob("/var/log/ceph/spdk*")
assert len(spdk_files) == 1
assert f"spdk-{gw.name}" in spdk_files[0]

def test_log_files_disabled(gateway):
gw = gateway
Expand All @@ -82,22 +89,22 @@ def test_log_files_disabled(gateway):
assert subs_list.status == 0
assert len(subs_list.subsystems) == 1
assert subs_list.subsystems[0].nqn == subsystem_prefix + "1"
files = os.listdir("/var/log/ceph")
files = glob.glob("/var/log/ceph/nvme*")
assert files == []

def test_log_files_rotation(gateway):
gw = gateway
files = os.listdir("/var/log/ceph")
files = glob.glob("/var/log/ceph/nvme*")
assert len(files) == 1
assert files[0] == f"nvmeof-{gw.name}"
assert f"nvmeof-{gw.name}" in files[0]
cli(["subsystem", "add", "--subsystem", subsystem_prefix + "2"])
cli(["subsystem", "add", "--subsystem", subsystem_prefix + "3"])
cli(["subsystem", "add", "--subsystem", subsystem_prefix + "4"])
for i in range(2000):
cli(["subsystem", "list"])
files = os.listdir("/var/log/ceph")
files = glob.glob("/var/log/ceph/nvme*")
assert len(files) == 1
assert files[0] == f"nvmeof-{gw.name}"
assert f"nvmeof-{gw.name}" in files[0]
files = os.listdir(f"/var/log/ceph/nvmeof-{gw.name}")
assert len(files) > 1
assert "nvmeof-log.1" in files
Expand All @@ -119,17 +126,17 @@ def test_log_files_rotation(gateway):

def test_log_files_disable_rotation(gateway):
gw = gateway
files = os.listdir("/var/log/ceph")
files = glob.glob("/var/log/ceph/nvme*")
assert len(files) == 1
assert files[0] == f"nvmeof-{gw.name}"
assert f"nvmeof-{gw.name}" in files[0]
cli(["subsystem", "add", "--subsystem", subsystem_prefix + "5"])
cli(["subsystem", "add", "--subsystem", subsystem_prefix + "6"])
cli(["subsystem", "add", "--subsystem", subsystem_prefix + "7"])
for i in range(2000):
cli(["subsystem", "list"])
files = os.listdir("/var/log/ceph")
files = glob.glob("/var/log/ceph/nvme*")
assert len(files) == 1
assert files[0] == f"nvmeof-{gw.name}"
assert f"nvmeof-{gw.name}" in files[0]
files = os.listdir(f"/var/log/ceph/nvmeof-{gw.name}")
assert len(files) == 1
assert files[0] == "nvmeof-log"
Expand All @@ -138,3 +145,11 @@ def test_log_files_disable_rotation(gateway):
with open(f"/var/log/ceph/nvmeof-{gw.name}/nvmeof-log", mode="r") as f:
check_for = f"Starting gateway {gw.name}"
assert check_for in f.read()

def test_no_spdk_log(gateway):
gw = gateway
files = glob.glob("/var/log/ceph/nvme*")
assert len(files) == 1
assert f"nvmeof-{gw.name}" in files[0]
spdk_files = glob.glob("/var/log/ceph/spdk*")
assert len(spdk_files) == 0

0 comments on commit d271cf3

Please sign in to comment.