Skip to content

Commit

Permalink
test: add E2E test that OS service restarts worker agent when it exits (
Browse files Browse the repository at this point in the history
#439)


* test: add test that verifies worker service restarts after shutdown

Signed-off-by: Yutong Li <[email protected]>
  • Loading branch information
YutongLi291 committed Oct 28, 2024
1 parent 418bf9b commit 7b26b38
Showing 1 changed file with 90 additions and 0 deletions.
90 changes: 90 additions & 0 deletions test/e2e/test_worker_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
This test module contains tests that verify the Worker agent's behavior by starting and stopping the Worker
and checking that the Worker's status is what we expect.
"""
from datetime import datetime, timezone
import logging
import os
import pytest
from deadline_test_fixtures import DeadlineClient, EC2InstanceWorker
import pytest
from e2e.utils import is_worker_started, is_worker_stopped
import backoff

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -40,3 +42,91 @@ def test_worker_lifecycle_status_is_expected(
fleet_id=deadline_resources.fleet.id,
worker_id=function_worker.worker_id,
)

@pytest.mark.skipif(
    os.environ["OPERATING_SYSTEM"] == "windows",
    reason="Linux specific test",
)
def test_linux_worker_restarts_process(
    self,
    deadline_resources,
    deadline_client: DeadlineClient,
    function_worker: EC2InstanceWorker,
) -> None:
    """Verify that the Linux worker service restarts the agent process after it is killed.

    Steps:
      1. Confirm the worker is reported as started and the ``deadline-worker``
         service is active with a running ``deadline-worker-agent`` process.
      2. Record the service's ``ActiveEnterTimestamp`` as reported by the worker host.
      3. Kill the agent process with ``pkill -9``.
      4. Wait for the service to be active again and for its
         ``ActiveEnterTimestamp`` to advance past the value recorded in step 2.
      5. Confirm an agent process is running again.

    Both timestamps are read from the worker host via ``systemctl show``, so the
    comparison does not depend on the clock of the machine running the test and
    both values have the same (one-second) parsed resolution.
    """

    assert function_worker.worker_id is not None  # This fixes linter type mismatch

    assert is_worker_started(
        deadline_client=deadline_client,
        farm_id=deadline_resources.farm.id,
        fleet_id=deadline_resources.fleet.id,
        worker_id=function_worker.worker_id,
    )

    @backoff.on_exception(
        backoff.constant,
        Exception,
        max_time=30,
        interval=2,
    )
    def check_service_is_active() -> None:
        # The service should be active; retried because the service may still be
        # coming (back) up when this is first called.
        service_check_result = function_worker.send_command(
            "systemctl is-active deadline-worker"
        )
        assert (
            service_check_result.exit_code == 0
        ), "Unable to check whether deadline-worker is active"
        assert (
            "inactive" not in service_check_result.stdout
            and "active" in service_check_result.stdout
        ), f"deadline-worker is in unexpected status {service_check_result.stdout}"

    def check_worker_processes_exist() -> None:
        # pgrep exits non-zero when no process matches.
        process_check_result = function_worker.send_command(
            f"pgrep --count --full -u {function_worker.configuration.agent_user} deadline-worker-agent"
        )

        assert (
            process_check_result.exit_code == 0
        ), "deadline-worker-agent process is not running"

    def get_service_active_enter_time() -> datetime:
        # Returns the time at which the service last entered the active state,
        # as reported by the worker host itself.
        timestamp_result = function_worker.send_command(
            "systemctl show --property=ActiveEnterTimestamp deadline-worker"
        )
        assert (
            timestamp_result.exit_code == 0
        ), f"Unable to query ActiveEnterTimestamp of deadline-worker: {timestamp_result}"

        # Output has the form "ActiveEnterTimestamp=<timestamp>".
        _, _, raw_timestamp = timestamp_result.stdout.partition("=")
        # NOTE(review): the parsed value is labelled as UTC regardless of the zone
        # name in the output; this presumes the worker host reports UTC. Because
        # both readings are parsed identically, the comparison below is unaffected.
        return datetime.strptime(
            raw_timestamp.strip(),
            "%a %Y-%m-%d %H:%M:%S %Z",
        ).replace(tzinfo=timezone.utc)

    # First check that the worker service is running
    check_service_is_active()

    # Check that the worker process is running
    check_worker_processes_exist()

    # Baseline taken from the worker host right before the kill
    time_service_started_before_kill: datetime = get_service_active_enter_time()

    # Kill the worker process
    pkill_command_result = function_worker.send_command(
        f"sudo pkill -9 --full -u {function_worker.configuration.agent_user} deadline-worker-agent"
    )
    assert (
        pkill_command_result.exit_code == 0
    ), f"Failed to kill the worker agent process: {pkill_command_result}"

    # Wait for the process to be restarted by the service
    check_service_is_active()

    @backoff.on_exception(
        backoff.constant,
        Exception,
        max_time=30,
        interval=2,
    )
    def check_service_restarted() -> None:
        # The service must have entered the active state again since the baseline
        # was recorded. Retried so that a sample taken before the restart has
        # completed does not fail the test outright.
        time_service_started: datetime = get_service_active_enter_time()
        assert (
            time_service_started > time_service_started_before_kill
        ), "Service has not restarted properly as its active-enter time did not advance after the kill command"

    check_service_restarted()

    # Check that there are worker processes running
    check_worker_processes_exist()

0 comments on commit 7b26b38

Please sign in to comment.