From 8e776e15f3d339ea22469c6ae83df48083c31fa9 Mon Sep 17 00:00:00 2001
From: Yutong Li <52769999+YutongLi291@users.noreply.github.com>
Date: Tue, 22 Oct 2024 16:54:57 -0700
Subject: [PATCH] test: add E2E test that OS service restarts worker agent when it exits (#439)

* test: add test that verifies worker service restarts after shutdown

Signed-off-by: Yutong Li <52769999+YutongLi291@users.noreply.github.com>
---
Notes (not part of the commit message):
  The added test was hand-edited after export, so the post-image blob id on
  the "index" line below has not been regenerated.

 test/e2e/test_worker_status.py | 107 ++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)

diff --git a/test/e2e/test_worker_status.py b/test/e2e/test_worker_status.py
index 18daf8ad..ba5e5d05 100644
--- a/test/e2e/test_worker_status.py
+++ b/test/e2e/test_worker_status.py
@@ -3,12 +3,14 @@
 This test module contains tests that verify the Worker agent's behavior by starting/stopping the Worker,
 and making sure that the status of the Worker is that of what we expect.
 """
+from datetime import datetime, timezone
 import logging
 import os
 import pytest
 from deadline_test_fixtures import DeadlineClient, EC2InstanceWorker
 import pytest
 from e2e.utils import is_worker_started, is_worker_stopped
+import backoff
 
 LOG = logging.getLogger(__name__)
 
@@ -40,3 +42,108 @@ def test_worker_lifecycle_status_is_expected(
             fleet_id=deadline_resources.fleet.id,
             worker_id=function_worker.worker_id,
         )
+
+    @pytest.mark.skipif(
+        os.environ["OPERATING_SYSTEM"] == "windows",
+        reason="Linux specific test",
+    )
+    def test_linux_worker_restarts_process(
+        self,
+        deadline_resources,
+        deadline_client: DeadlineClient,
+        function_worker: EC2InstanceWorker,
+    ) -> None:
+        """Verify that the Linux service restarts the worker agent after its process is killed."""
+
+        assert function_worker.worker_id is not None  # This fixes linter type mismatch
+
+        assert is_worker_started(
+            deadline_client=deadline_client,
+            farm_id=deadline_resources.farm.id,
+            fleet_id=deadline_resources.fleet.id,
+            worker_id=function_worker.worker_id,
+        )
+
+        # First check that the worker service is running
+
+        @backoff.on_exception(
+            backoff.constant,
+            Exception,
+            max_time=30,
+            interval=2,
+        )
+        def check_service_is_active() -> None:
+            # The service should be active; failed checks are retried every 2s for up to 30s
+            service_check_result = function_worker.send_command(
+                "systemctl is-active deadline-worker"
+            )
+            assert (
+                service_check_result.exit_code == 0
+            ), "Unable to check whether deadline-worker is active"
+            assert (
+                "inactive" not in service_check_result.stdout
+                and "active" in service_check_result.stdout
+            ), f"deadline-worker is in unexpected status {service_check_result.stdout}"
+
+        check_service_is_active()
+
+        # Check that the worker process is running
+
+        def check_worker_processes_exist() -> None:
+            process_check_result = function_worker.send_command(
+                f"pgrep --count --full -u {function_worker.configuration.agent_user} deadline-worker-agent"
+            )
+
+            assert (
+                process_check_result.exit_code == 0
+            ), "deadline-worker-agent process is not running"
+
+        check_worker_processes_exist()
+
+        # The ActiveEnterTimestamp parsed below has one-second resolution, so truncate the
+        # reference time to whole seconds. Otherwise a restart landing in the same second as
+        # the kill would parse as earlier than the kill and fail the comparison.
+        # NOTE(review): this is the test host's clock, compared against the worker host's
+        # clock below - assumes the two are in sync; confirm.
+        time_that_worker_was_killed: datetime = datetime.now(timezone.utc).replace(microsecond=0)
+
+        # Kill the worker process
+        pkill_command_result = function_worker.send_command(
+            f"sudo pkill -9 --full -u {function_worker.configuration.agent_user} deadline-worker-agent"
+        )
+        assert (
+            pkill_command_result.exit_code == 0
+        ), f"Failed to kill the worker agent process: {pkill_command_result}"
+
+        # Wait for the process to be restarted by the service
+
+        check_service_is_active()
+
+        # Check that the service became active no earlier than when we killed the process, since
+        # it should have restarted after the kill
+        service_active_enter_timestamp_result = function_worker.send_command(
+            "systemctl show --property=ActiveEnterTimestamp deadline-worker"
+        )
+        assert (
+            service_active_enter_timestamp_result.exit_code == 0
+        ), "Unable to read ActiveEnterTimestamp of deadline-worker"
+
+        # Output has the form "ActiveEnterTimestamp=<value>"; fail with a readable message
+        # instead of an IndexError/ValueError when the value is missing.
+        show_output = service_active_enter_timestamp_result.stdout
+        active_enter_timestamp = show_output.partition("=")[2].strip()
+        assert active_enter_timestamp, f"ActiveEnterTimestamp is empty: {show_output}"
+
+        # The parsed value is stamped as UTC via replace().
+        # NOTE(review): presumably the worker host reports times in UTC; confirm.
+        time_service_started: datetime = datetime.strptime(
+            active_enter_timestamp,
+            "%a %Y-%m-%d %H:%M:%S %Z",
+        ).replace(tzinfo=timezone.utc)
+
+        assert (
+            time_service_started >= time_that_worker_was_killed
+        ), "Service has not restarted properly as service started before kill command"
+
+        # Check that there are worker processes running
+        check_worker_processes_exist()