From cd5a2a97c5dd740dc16ee8a0b2e99c412872c893 Mon Sep 17 00:00:00 2001 From: David Hageman Date: Sat, 17 Feb 2024 16:57:31 -0600 Subject: [PATCH] Add -ness checks and refactor migrations --- config/crd/bases/awx.ansible.com_awxs.yaml | 80 ++++++++++++++++ config/rbac/role.yaml | 11 +++ .../container-probes.md | 40 ++++++++ roles/installer/tasks/install.yml | 44 +-------- .../tasks/resources_configuration.yml | 95 +++++++++++++++++-- .../templates/deployments/task.yaml.j2 | 47 ++++++++- .../templates/deployments/web.yaml.j2 | 25 +++++ .../installer/templates/job/migration.yaml.j2 | 55 +++++++++++ 8 files changed, 345 insertions(+), 52 deletions(-) create mode 100644 docs/user-guide/advanced-configuration/container-probes.md create mode 100644 roles/installer/templates/job/migration.yaml.j2 diff --git a/config/crd/bases/awx.ansible.com_awxs.yaml b/config/crd/bases/awx.ansible.com_awxs.yaml index e3387b26e..48d4b1fe4 100644 --- a/config/crd/bases/awx.ansible.com_awxs.yaml +++ b/config/crd/bases/awx.ansible.com_awxs.yaml @@ -1571,6 +1571,86 @@ spec: description: Number of task instance replicas type: integer format: int32 + web_liveness_initial_delay: + description: Initial delay before starting liveness checks on web pod + type: integer + default: 5 + format: int32 + task_liveness_initial_delay: + description: Initial delay before starting liveness checks on task pod + type: integer + default: 5 + format: int32 + web_liveness_period: + description: Time period in seconds between each liveness check for the web pod + type: integer + default: 0 + format: int32 + task_liveness_period: + description: Time period in seconds between each liveness check for the task pod + type: integer + default: 0 + format: int32 + web_liveness_failure_threshold: + description: Number of consecutive failure events to identify failure of web pod + type: integer + default: 3 + format: int32 + task_liveness_failure_threshold: + description: Number of consecutive failure events to identify 
failure of task pod + type: integer + default: 3 + format: int32 + web_liveness_timeout: + description: Number of seconds to wait for a probe response from web pod + type: integer + default: 1 + format: int32 + task_liveness_timeout: + description: Number of seconds to wait for a probe response from task pod + type: integer + default: 1 + format: int32 + web_readiness_initial_delay: + description: Initial delay before starting readiness checks on web pod + type: integer + default: 20 + format: int32 + task_readiness_initial_delay: + description: Initial delay before starting readiness checks on task pod + type: integer + default: 20 + format: int32 + web_readiness_period: + description: Time period in seconds between each readiness check for the web pod + type: integer + default: 0 + format: int32 + task_readiness_period: + description: Time period in seconds between each readiness check for the task pod + type: integer + default: 0 + format: int32 + web_readiness_failure_threshold: + description: Number of consecutive failure events to identify failure of web pod + type: integer + default: 3 + format: int32 + task_readiness_failure_threshold: + description: Number of consecutive failure events to identify failure of task pod + type: integer + default: 3 + format: int32 + web_readiness_timeout: + description: Number of seconds to wait for a probe response from web pod + type: integer + default: 1 + format: int32 + task_readiness_timeout: + description: Number of seconds to wait for a probe response from task pod + type: integer + default: 1 + format: int32 garbage_collect_secrets: description: Whether or not to remove secrets upon instance removal default: false diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 9d2af0ce2..e5c9b59a9 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -66,6 +66,17 @@ rules: - patch - update - watch + - apiGroups: + - batch + resources: + - jobs + verbs: + - get + - list + - create + - patch + - update + - 
watch - apiGroups: - networking.k8s.io resources: diff --git a/docs/user-guide/advanced-configuration/container-probes.md b/docs/user-guide/advanced-configuration/container-probes.md new file mode 100644 index 000000000..338e53773 --- /dev/null +++ b/docs/user-guide/advanced-configuration/container-probes.md @@ -0,0 +1,41 @@ +#### Container Probes +These parameters control the usage of liveness and readiness container probes for +the web and task containers. + +#### Web / Task Container Liveness Check + +The liveness probe queries the status of the supervisor daemon of the container. The probe will fail if it +detects one of the services in a state other than "RUNNING". + +| Name | Description | Default | +| -------------| ---------------------------------- | ------- | +| web_liveness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 | +| web_liveness_initial_delay | Initial delay before starting probes in seconds | 5 | +| web_liveness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 | +| web_liveness_timeout | Number of seconds to wait for a probe response from container | 1 | +| task_liveness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 | +| task_liveness_initial_delay | Initial delay before starting probes in seconds | 5 | +| task_liveness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 | +| task_liveness_timeout | Number of seconds to wait for a probe response from container | 1 | + +#### Web Container Readiness Check + +This is an HTTP check against the status endpoint to confirm the system is still able to respond to web requests. + +| Name | Description | Default | +| -------------| ---------------------------------- | ------- | +| web_readiness_period | Time period in seconds between each probe check. The value of 0 disables the probe.
| 0 | +| web_readiness_initial_delay | Initial delay before starting probes in seconds | 20 | +| web_readiness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 | +| web_readiness_timeout | Number of seconds to wait for a probe response from container | 1 | + +#### Task Container Readiness Check + +This is a command probe using the built-in check command of the awx-manage utility. + +| Name | Description | Default | +| -------------| ---------------------------------- | ------- | +| task_readiness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 | +| task_readiness_initial_delay | Initial delay before starting probes in seconds | 20 | +| task_readiness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 | +| task_readiness_timeout | Number of seconds to wait for a probe response from container | 1 | diff --git a/roles/installer/tasks/install.yml b/roles/installer/tasks/install.yml index 2398ebb4d..a17bf73b8 100644 --- a/roles/installer/tasks/install.yml +++ b/roles/installer/tasks/install.yml @@ -91,51 +91,9 @@ ignore_errors: yes changed_when: false -- name: Include resources configuration tasks +- name: Include resources configuration and database schema migration tasks include_tasks: resources_configuration.yml -- name: Check for pending migrations - k8s_exec: - namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" - command: >- - bash -c "awx-manage showmigrations | grep -v '[X]' | grep '[ ]' | wc -l" - changed_when: false - when: awx_task_pod_name != '' - register: database_check - -- name: Migrate the database if the K8s resources were updated # noqa 305 - k8s_exec: - namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" - command: | - bash -c " - function end_keepalive
{ - rc=$? - rm -f \"$1\" - kill $(cat /proc/$2/task/$2/children 2>/dev/null) 2>/dev/null || true - wait $2 || true - exit $rc - } - keepalive_file=\"$(mktemp)\" - while [[ -f \"$keepalive_file\" ]]; do - echo 'Database schema migration in progress...' - sleep 60 - done & - keepalive_pid=$! - trap 'end_keepalive \"$keepalive_file\" \"$keepalive_pid\"' EXIT SIGINT SIGTERM - echo keepalive_pid: $keepalive_pid - awx-manage migrate --noinput - echo 'Successful' - " - register: migrate_result - when: - - awx_task_pod_name != '' - - database_check is defined - - (database_check.stdout|trim) != '0' - - name: Initialize Django include_tasks: initialize_django.yml when: awx_task_pod_name != '' diff --git a/roles/installer/tasks/resources_configuration.yml b/roles/installer/tasks/resources_configuration.yml index c811aeeb7..d9a0552ec 100644 --- a/roles/installer/tasks/resources_configuration.yml +++ b/roles/installer/tasks/resources_configuration.yml @@ -12,7 +12,7 @@ - status.phase=Running register: awx_task_pod -- name: Set the resource pod as a variable. +- name: Set the resource task pod as a variable. set_fact: awx_task_pod: >- {{ awx_task_pod['resources'] @@ -20,7 +20,7 @@ | sort(attribute='metadata.creationTimestamp') | first | default({}) }} -- name: Set the resource pod name as a variable. +- name: Set the resource task pod name as a variable. set_fact: awx_task_pod_name: "{{ awx_task_pod['metadata']['name'] | default('') }}" @@ -249,15 +249,92 @@ k8s: apply: yes definition: "{{ lookup('template', 'deployments/{{ item }}.yaml.j2') }}" - wait: yes - wait_timeout: "{{ (120 * replicas) or 120 }}" loop: - - task - web + - task register: this_deployment_result - block: - - name: Get the new resource pod information after updating resource. + - name: Get the new web pod information after updating resource.
+ k8s_info: + kind: Pod + namespace: '{{ ansible_operator_meta.namespace }}' + label_selectors: + - "app.kubernetes.io/name={{ ansible_operator_meta.name }}-web" + - "app.kubernetes.io/managed-by={{ deployment_type }}-operator" + - "app.kubernetes.io/component={{ deployment_type }}" + field_selectors: + - status.phase=Running + register: _new_pod + + - name: Update new web pod as a variable. + set_fact: + awx_web_pod: >- + {{ _new_pod['resources'] + | rejectattr('metadata.deletionTimestamp', 'defined') + | sort(attribute='metadata.creationTimestamp') + | last | default({}) }} + + - name: Update new web pod name as a variable. + set_fact: + awx_web_pod_name: '{{ awx_web_pod["metadata"]["name"] | default("")}}' + + # We use the web pod to check for migrations because it should be in a running + # state. The task pod will be waiting in an init container waiting for + # the migrations to finish. + - name: Check for pending migrations + k8s_exec: + namespace: "{{ ansible_operator_meta.namespace }}" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" + command: >- + bash -c "awx-manage showmigrations | grep -v '[X]' | grep '[ ]' | wc -l" + changed_when: false + when: awx_web_pod_name != '' + register: database_check + + - name: Get version of controller for tracking migrations + k8s_exec: + namespace: "{{ ansible_operator_meta.namespace }}" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" + command: >- + bash -c "awx-manage --version" + changed_when: false + register: version_check + when: + - awx_web_pod_name != '' + - (database_check.stdout|trim) != '0' + + - name: Update instance version + set_fact: + version: "{{ version_check.stdout | trim }}" + when: version_check.stdout is defined + + - name: Create migration job + k8s: + apply: yes + definition: "{{ lookup('template', 'job/migration.yaml.j2') }}" + register: migrate_result + when: + - awx_web_pod_name != '' + - (database_check.stdout|trim) != '0' + + - name: Watch for the
migration job to finish + k8s_info: + kind: Job + namespace: "{{ ansible_operator_meta.namespace }}" + name: "{{ ansible_operator_meta.name }}-migration-{{ version }}" + register: result + until: + - result.resources[0].status.succeeded is defined + - result.resources[0].status.succeeded == 1 + retries: 180 + delay: 10 + # Poll only when a migration job was created above; with no job the lookup can never succeed. + when: migrate_result is not skipped + + - name: Get the new task pod information after updating resource. k8s_info: kind: Pod namespace: '{{ ansible_operator_meta.namespace }}' @@ -267,9 +344,13 @@ - "app.kubernetes.io/component={{ deployment_type }}" field_selectors: - status.phase=Running + wait: true + wait_timeout: 30 + wait_condition: + type: Ready register: _new_pod - - name: Update new resource pod as a variable. + - name: Update new task pod as a variable. set_fact: awx_task_pod: >- {{ _new_pod['resources'] @@ -277,9 +358,10 @@ | sort(attribute='metadata.creationTimestamp') | last | default({}) }} - - name: Update new resource pod name as a variable. + - name: Update new task pod name as a variable.
set_fact: awx_task_pod_name: '{{ awx_task_pod["metadata"]["name"] | default("")}}' + when: - this_deployment_result.changed diff --git a/roles/installer/templates/deployments/task.yaml.j2 b/roles/installer/templates/deployments/task.yaml.j2 index cb57fbd13..af5c4b848 100644 --- a/roles/installer/templates/deployments/task.yaml.j2 +++ b/roles/installer/templates/deployments/task.yaml.j2 @@ -74,7 +74,28 @@ spec: priorityClassName: '{{ control_plane_priority_class }}' {% endif %} initContainers: - - name: init + - name: init-database + image: '{{ _image }}' + imagePullPolicy: '{{ image_pull_policy }}' + resources: {{ init_container_resource_requirements }} + command: + - /bin/sh + - -c + - wait-for-migrations + volumeMounts: + - name: {{ ansible_operator_meta.name }}-application-credentials + mountPath: "/etc/tower/conf.d/credentials.py" + subPath: credentials.py + readOnly: true + - name: "{{ secret_key_secret_name }}" + mountPath: /etc/tower/SECRET_KEY + subPath: SECRET_KEY + readOnly: true + - name: {{ ansible_operator_meta.name }}-settings + mountPath: "/etc/tower/settings.py" + subPath: settings.py + readOnly: true + - name: init-receptor image: '{{ _init_container_image }}' imagePullPolicy: '{{ image_pull_policy }}' resources: {{ init_container_resource_requirements }} @@ -188,6 +209,30 @@ spec: {% endif %} {% if task_args %} args: {{ task_args }} +{% endif %} +{% if task_liveness_period|int > 0 %} + livenessProbe: + exec: + command: + - sh + - -c + - | + (exit $(/usr/bin/supervisorctl -c /etc/supervisord_task.conf status | grep -vc RUNNING)) + initialDelaySeconds: {{ task_liveness_initial_delay }} + periodSeconds: {{ task_liveness_period }} + failureThreshold: {{ task_liveness_failure_threshold }} + timeoutSeconds: {{ task_liveness_timeout }} +{% endif %} +{% if task_readiness_period|int > 0 %} + readinessProbe: + exec: + command: + - /usr/bin/awx-manage + - check + initialDelaySeconds: {{ task_readiness_initial_delay }} + periodSeconds: {{ 
task_readiness_period }} + failureThreshold: {{ task_readiness_failure_threshold }} + timeoutSeconds: {{ task_readiness_timeout }} {% endif %} volumeMounts: {% if bundle_ca_crt %} diff --git a/roles/installer/templates/deployments/web.yaml.j2 b/roles/installer/templates/deployments/web.yaml.j2 index 1a7318348..568a6d690 100644 --- a/roles/installer/templates/deployments/web.yaml.j2 +++ b/roles/installer/templates/deployments/web.yaml.j2 @@ -162,6 +162,31 @@ spec: - containerPort: 8052 {% if ingress_type | lower == 'route' and route_tls_termination_mechanism | lower == 'passthrough' %} - containerPort: 8053 +{% endif %} +{% if web_liveness_period|int > 0 %} + # Fails when any service managed by the web container's supervisor daemon is not RUNNING. + livenessProbe: + exec: + command: + - sh + - -c + - | + (exit $(/usr/bin/supervisorctl -c /etc/supervisord_web.conf status | grep -vc RUNNING)) + initialDelaySeconds: {{ web_liveness_initial_delay }} + periodSeconds: {{ web_liveness_period }} + failureThreshold: {{ web_liveness_failure_threshold }} + timeoutSeconds: {{ web_liveness_timeout }} +{% endif %} +{% if web_readiness_period|int > 0 %} + readinessProbe: + httpGet: + path: /api/v2/ping/ + scheme: HTTP + port: 8052 + initialDelaySeconds: {{ web_readiness_initial_delay }} + periodSeconds: {{ web_readiness_period }} + failureThreshold: {{ web_readiness_failure_threshold }} + timeoutSeconds: {{ web_readiness_timeout }} {% endif %} volumeMounts: {% if bundle_ca_crt %} diff --git a/roles/installer/templates/job/migration.yaml.j2 b/roles/installer/templates/job/migration.yaml.j2 new file mode 100644 index 000000000..a83062a10 --- /dev/null +++ b/roles/installer/templates/job/migration.yaml.j2 @@ -0,0 +1,55 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: '{{ ansible_operator_meta.name }}-migration-{{ version }}' + namespace: '{{ ansible_operator_meta.namespace }}' + labels: + {{ lookup("template", "../common/templates/labels/common.yaml.j2") | indent(width=4) | trim }} + {{ lookup("template", "../common/templates/labels/version.yaml.j2") |
indent(width=4) | trim }} +spec: + template: + spec: + containers: + - name: "migration-job" + image: '{{ _image }}' + command: + - /usr/bin/awx-manage + - migrate + - --noinput + volumeMounts: + - name: {{ ansible_operator_meta.name }}-application-credentials + mountPath: "/etc/tower/conf.d/credentials.py" + subPath: credentials.py + readOnly: true + - name: "{{ secret_key_secret_name }}" + mountPath: /etc/tower/SECRET_KEY + subPath: SECRET_KEY + readOnly: true + - name: {{ ansible_operator_meta.name }}-settings + mountPath: "/etc/tower/settings.py" + subPath: settings.py + readOnly: true + volumes: + - name: "{{ ansible_operator_meta.name }}-application-credentials" + secret: + secretName: "{{ ansible_operator_meta.name }}-app-credentials" + items: + - key: credentials.py + path: 'credentials.py' + - key: execution_environments.py + path: 'execution_environments.py' + - name: "{{ secret_key_secret_name }}" + secret: + secretName: '{{ secret_key_secret_name }}' + items: + - key: secret_key + path: SECRET_KEY + - name: {{ ansible_operator_meta.name }}-settings + configMap: + name: '{{ ansible_operator_meta.name }}-{{ deployment_type }}-configmap' + items: + - key: settings + path: settings.py + dnsPolicy: ClusterFirst + restartPolicy: Never + terminationGracePeriodSeconds: 30