Skip to content

Commit

Permalink
feat: windows service (#207)
Browse files Browse the repository at this point in the history
Signed-off-by: Josh Usiskin <[email protected]>
  • Loading branch information
jusiskin authored Mar 16, 2024
1 parent f5ee600 commit 1d97970
Show file tree
Hide file tree
Showing 17 changed files with 1,308 additions and 296 deletions.
29 changes: 29 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,13 @@ ignore = [
"F811",
]

[tool.ruff.lint.per-file-ignores]
# We need to use a platform assertion to short-circuit mypy type checking on non-Windows platforms
# https://mypy.readthedocs.io/en/stable/common_issues.html#python-version-and-system-platform-checks
# This causes imports to come after regular Python statements causing flake8 rule E402 to be flagged
"src/deadline_worker_agent/**/*windows*.py" = ["E402"]
"test/**/*windows*.py" = ["E402"]

[tool.ruff.lint.isort]
known-first-party = [
"deadline_worker_agent",
Expand Down Expand Up @@ -153,6 +160,9 @@ omit = [
"*/scheduler/**/*.py",
"*/worker.py",
]
plugins = [
"coverage_conditional_plugin"
]

[tool.coverage.paths]
source = [ "src/" ]
Expand All @@ -161,6 +171,25 @@ source = [ "src/" ]
show_missing = true
fail_under = 78

# https://github.com/wemake-services/coverage-conditional-plugin
[tool.coverage.coverage_conditional_plugin.omit]
"sys_platform != 'win32'" = [
"src/deadline_worker_agent/windows/*.py",
"src/deadline_worker_agent/installer/win_installer.py"
]

[tool.coverage.coverage_conditional_plugin.rules]
# This cannot be empty otherwise coverage-conditional-plugin crashes with:
# AttributeError: 'NoneType' object has no attribute 'items'
#
# =========== WARNING TO REVIEWERS ============
#
# Any rules added here are run through Python's
# eval() function so watch for code injection
# attacks.
#
# =========== WARNING TO REVIEWERS ============

[tool.semantic_release]
# Can be removed or set to true once we are v1
major_on_zero = false
Expand Down
1 change: 1 addition & 0 deletions requirements-testing.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
coverage[toml] ~= 7.4
coverage-conditional-plugin == 0.9.*
deadline-cloud-test-fixtures == 0.5.*
pytest ~= 8.1
pytest-cov == 4.1.*
Expand Down
13 changes: 8 additions & 5 deletions src/deadline_worker_agent/installer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ def install() -> None:
fleet_id=args.fleet_id,
region=args.region,
worker_agent_program=scripts_path,
no_install_service=not args.install_service,
start=args.service_start,
install_service=args.install_service,
start_service=args.service_start,
confirm=args.confirmed,
allow_shutdown=args.allow_shutdown,
parser=arg_parser,
Expand Down Expand Up @@ -96,8 +96,8 @@ class ParsedCommandLineArguments(Namespace):
fleet_id: str
region: str
user: str
password: Optional[str]
group: Optional[str]
password: Optional[str] = None
group: Optional[str] = None
confirmed: bool
service_start: bool
allow_shutdown: bool
Expand Down Expand Up @@ -184,7 +184,10 @@ def get_argument_parser() -> ArgumentParser: # pragma: no cover
if sys.platform == "win32":
parser.add_argument(
"--password",
help="The password for the AWS Deadline Cloud Worker Agent user. Defaults to generating a password.",
help=(
"The password for the AWS Deadline Cloud Worker Agent user. Defaults to generating a password "
"if the user does not exist or prompting for the password if the user pre-exists."
),
required=False,
default=None,
)
Expand Down
208 changes: 195 additions & 13 deletions src/deadline_worker_agent/installer/win_installer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,29 @@
import shutil
import string
import sys
import typing
from argparse import ArgumentParser
from getpass import getpass
from pathlib import Path

from deadline_worker_agent.file_system_operations import (
_set_windows_permissions,
FileSystemPermissionEnum,
)
from typing import Optional

import deadline.client.config.config_file
import pywintypes
import win32api
import win32net
import win32netcon
import win32security
import win32service
import win32serviceutil
import winerror
from openjd.sessions import BadCredentialsException, WindowsSessionUser
from win32comext.shell import shell

from ..file_system_operations import (
_set_windows_permissions,
FileSystemPermissionEnum,
)
from ..windows.win_service import WorkerAgentWindowsService


# Defaults
DEFAULT_WA_USER = "deadline-worker"
Expand Down Expand Up @@ -172,6 +177,9 @@ def ensure_local_agent_user(username: str, password: str) -> None:
"""
if check_user_existence(username):
logging.info(f"Agent User {username} already exists")
# This is only to verify the credentials. It will raise a BadCredentialsError if the
# credentials cannot be used to logon the user
WindowsSessionUser(user=username, password=password)
else:
logging.info(f"Creating Agent user {username}")
user_info = {
Expand Down Expand Up @@ -254,7 +262,7 @@ def update_config_file(
deadline_config_sub_directory: str,
farm_id: str,
fleet_id: str,
shutdown_on_stop: typing.Optional[bool] = None,
shutdown_on_stop: Optional[bool] = None,
) -> None:
"""
Updates the worker configuration file, creating it from the example if it does not exist.
Expand Down Expand Up @@ -435,18 +443,167 @@ def update_deadline_client_config(
os.environ.update(old_environ)


def _install_service(
    *,
    agent_user_name: str,
    password: str,
) -> None:
    """Installs (or updates) the Windows Service that hosts the Worker Agent

    The service is configured to start automatically on boot and to run as the
    supplied user. If a service with the same name is already registered, its
    configuration is updated in place instead of failing. In either case the
    failure actions of the service are (re)configured afterwards.

    Parameters
        agent_user_name(str): Worker Agent's account username. When it contains
            neither a backslash nor an "@", the local domain is assumed.
        password(str): The Worker Agent's user account password

    Raises
        win32service.error: If creating the service fails for any reason other
            than the service already existing.
    """

    # If the username does not contain the domain, then assume the local domain
    # https://learn.microsoft.com/en-us/windows/win32/secauthn/user-name-formats
    if "\\" not in agent_user_name and "@" not in agent_user_name:
        agent_user_name = f".\\{agent_user_name}"

    # Determine the Windows Service configuration. This uses the same logic as
    # win32serviceutil.HandleCommandLine() so that the service can be debugged
    # using:
    #
    #   python -m deadline_worker_agent.windows.win_service debug
    service_class_str = win32serviceutil.GetServiceClassString(WorkerAgentWindowsService)
    service_name = WorkerAgentWindowsService._svc_name_
    service_display_name = WorkerAgentWindowsService._svc_display_name_

    # Settings shared by the create and the update code paths. Keeping them in a
    # single place guarantees that an updated service ends up configured exactly
    # like a freshly created one.
    shared_config = {
        "serviceDeps": None,
        # Configure the service to start on boot
        "startType": win32service.SERVICE_AUTO_START,
        "bRunInteractive": None,
        "userName": agent_user_name,
        "password": password,
        "exeName": getattr(WorkerAgentWindowsService, "_exe_name_", None),
        "perfMonIni": None,
        "perfMonDll": None,
        "exeArgs": getattr(WorkerAgentWindowsService, "_exe_args_", None),
        "description": getattr(WorkerAgentWindowsService, "_svc_description_", None),
        "delayedstart": False,
    }

    logging.info(f'Configuring Windows Service "{service_display_name}"...')
    try:
        win32serviceutil.InstallService(
            service_class_str,
            service_name,
            service_display_name,
            **shared_config,
        )
    except win32service.error as exc:
        # Only "service already exists" is recoverable; anything else propagates
        if exc.winerror != winerror.ERROR_SERVICE_EXISTS:
            raise
        logging.info(f'Service "{service_display_name}" already exists, updating instead...')
        win32serviceutil.ChangeServiceConfig(
            service_class_str,
            service_name,
            displayName=service_display_name,
            **shared_config,
        )
        logging.info(f'Successfully updated Windows Service "{service_display_name}"')
    else:
        logging.info(f'Successfully created Windows Service "{service_display_name}"')

    logging.info(f'Configuring the failure actions of Windows Service "{service_display_name}"...')
    configure_service_failure_actions(service_name)
    logging.info(
        f'Successfully configured the failure actions for Windows Service "{service_display_name}"'
    )


def configure_service_failure_actions(service_name: str) -> None:
    """Configures the failure actions of the Windows Service.

    We use exponential backoff with a base of 2 seconds and doubling each iteration. This grows until
    it reaches ~4m 16s and then repeats indefinitely at this interval. The backoff resets if the service
    heals and stays alive for 20 minutes.

    This uses the ChangeServiceConfig2 win32 API:

        https://learn.microsoft.com/en-us/windows/win32/api/winsvc/nf-winsvc-changeserviceconfig2w

    Notably, the third parameter of ChangeServiceConfig2 expects a SERVICE_FAILURE_ACTIONSW structure,
    whose API reference docs best explain how Windows Service failure actions work:

        https://learn.microsoft.com/en-us/windows/win32/api/winsvc/ns-winsvc-service_failure_actionsw#remarks

    Parameters
        service_name(str): The name of the Windows Service whose failure actions are configured
    """

    # pywin32's ChangeServiceConfig2 wrapper accepts tuples of: (action type, delay in ms)
    # Exponential backoff with base of 2 seconds (2000 ms), doubling each iteration.
    # The backoff grows from 2 seconds to ~4m 16s over 8 attempts totalling 510s (or 8m 30s).
    actions = [(win32service.SC_ACTION_RESTART, 2000 * 2**i) for i in range(8)]

    logging.debug("Opening the Service Control Manager...")
    scm = win32service.OpenSCManager(None, None, win32service.SC_MANAGER_ALL_ACCESS)
    logging.debug("Successfully opened the Service Control Manager")
    try:
        logging.debug(f'Opening the Windows Service "{service_name}"')
        service = win32service.OpenService(scm, service_name, win32service.SERVICE_ALL_ACCESS)
        logging.debug(f'Successfully opened the Windows Service "{service_name}"')

        logging.debug(f'Modifying the failure actions of Windows Service "{service_name}"...')
        try:
            win32service.ChangeServiceConfig2(
                service,
                win32service.SERVICE_CONFIG_FAILURE_ACTIONS,
                {
                    # Repeat the last action (restart with ~4m 16s delay) until the service recovers
                    # for 20 minutes (in seconds)
                    "ResetPeriod": 20 * 60,
                    "RebootMsg": None,
                    "Command": None,
                    "Actions": actions,
                },
            )
            logging.debug(
                f'Successfully modified the failure actions of Windows Service "{service_name}"'
            )
        finally:
            # Always release the service handle, even if the update failed
            logging.debug(f'Closing the Windows Service "{service_name}"...')
            win32service.CloseServiceHandle(service)
            logging.debug(f'Successfully closed the Windows Service "{service_name}"')
    finally:
        # Always release the SCM handle, even if opening/updating the service failed
        logging.debug("Closing the Service Control Manager...")
        win32service.CloseServiceHandle(scm)
        logging.debug("Successfully closed the Service Control Manager")


def _start_service() -> None:
    """Starts the Windows Service hosting the Worker Agent

    Starting is best-effort: any failure is reported as a warning in the log
    and is not propagated to the caller.
    """
    svc = WorkerAgentWindowsService._svc_name_
    logging.info(f'Starting service "{svc}"...')

    try:
        win32serviceutil.StartService(serviceName=svc)
    except Exception as e:
        # Do not abort the installer when the service cannot be started
        logging.warning(f'Failed to start service "{svc}": {e}')
        return

    logging.info(f'Successfully started service "{svc}"')


def start_windows_installer(
farm_id: str,
fleet_id: str,
region: str,
worker_agent_program: Path,
allow_shutdown: bool,
parser: ArgumentParser,
password: typing.Optional[str] = None,
user_name: str = DEFAULT_WA_USER,
password: Optional[str] = None,
group_name: str = DEFAULT_JOB_GROUP,
no_install_service: bool = False,
start: bool = False,
install_service: bool = False,
start_service: bool = False,
confirm: bool = False,
telemetry_opt_out: bool = False,
):
Expand All @@ -469,8 +626,6 @@ def print_helping_info_and_exit():
elif not validate_deadline_id("fleet", fleet_id):
logging.error(f"Not a valid value for Fleet id: {fleet_id}")
print_helping_info_and_exit()
if not password:
password = generate_password()

# Check that user has Administrator privileges
if not shell.IsUserAnAdmin():
Expand All @@ -479,6 +634,18 @@ def print_helping_info_and_exit():

# Print configuration
print_banner()

if not password:
if check_user_existence(user_name):
password = getpass("Agent user password: ")
try:
WindowsSessionUser(user_name, password=password)
except BadCredentialsException:
print("ERROR: Password incorrect")
sys.exit(1)
else:
password = generate_password()

print(
f"Farm ID: {farm_id}\n"
f"Fleet ID: {fleet_id}\n"
Expand All @@ -487,9 +654,11 @@ def print_helping_info_and_exit():
f"Worker job group: {group_name}\n"
f"Worker agent program path: {str(worker_agent_program)}\n"
f"Allow worker agent shutdown: {allow_shutdown}\n"
f"Start service: {start}\n"
f"Install Windows service: {install_service}\n"
f"Start service: {start_service}"
f"Telemetry opt-out: {telemetry_opt_out}"
)
print()

# Confirm installation
if not confirm:
Expand All @@ -515,9 +684,11 @@ def print_helping_info_and_exit():

# Check if the job group exists, and create it if not
ensure_local_queue_user_group_exists(group_name)

# Add the worker agent user to the job group
add_user_to_group(group_name, user_name)

# Create directories and configure their permissions
agent_dirs = provision_directories(user_name)
update_config_file(
str(agent_dirs.deadline_config_subdir),
Expand All @@ -539,3 +710,14 @@ def print_helping_info_and_exit():
settings={"telemetry.opt_out": "true"},
)
logging.info("Opted out of client telemetry")

# Install the Windows service if specified
if install_service:
_install_service(
agent_user_name=user_name,
password=password,
)

# Start the Windows service if specified
if start_service:
_start_service()
Loading

0 comments on commit 1d97970

Please sign in to comment.