Skip to content

Commit

Permalink
Add -ness checks and refactor migrations
Browse files Browse the repository at this point in the history
  • Loading branch information
dhageman committed Feb 17, 2024
1 parent fc11db4 commit cd5a2a9
Show file tree
Hide file tree
Showing 8 changed files with 345 additions and 52 deletions.
80 changes: 80 additions & 0 deletions config/crd/bases/awx.ansible.com_awxs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1571,6 +1571,86 @@ spec:
description: Number of task instance replicas
type: integer
format: int32
web_liveness_initial_delay:
description: Initial delay before starting liveness checks on web pod
type: integer
default: 5
format: int32
task_liveness_initial_delay:
description: Initial delay before starting liveness checks on task pod
type: integer
default: 5
format: int32
web_liveness_period:
description: Time period in seconds between each liveness check for the web pod
type: integer
default: 0
format: int32
task_liveness_period:
description: Time period in seconds between each liveness check for the task pod
type: integer
default: 0
format: int32
web_liveness_failure_threshold:
description: Number of consecutive failure events to identify failure of web pod
type: integer
default: 3
format: int32
task_liveness_failure_threshold:
description: Number of consecutive failure events to identify failure of task pod
type: integer
default: 3
format: int32
web_liveness_timeout:
description: Number of seconds to wait for a probe response from web pod
type: integer
default: 1
format: int32
task_liveness_timeout:
description: Number of seconds to wait for a probe response from task pod
type: integer
default: 1
format: int32
web_readiness_initial_delay:
description: Initial delay before starting readiness checks on web pod
type: integer
default: 20
format: int32
task_readiness_initial_delay:
description: Initial delay before starting readiness checks on task pod
type: integer
default: 20
format: int32
web_readiness_period:
description: Time period in seconds between each readiness check for the web pod
type: integer
default: 0
format: int32
task_readiness_period:
description: Time period in seconds between each readiness check for the task pod
type: integer
default: 0
format: int32
web_readiness_failure_threshold:
description: Number of consecutive failure events to identify failure of web pod
type: integer
default: 3
format: int32
task_readiness_failure_threshold:
description: Number of consecutive failure events to identify failure of task pod
type: integer
default: 3
format: int32
web_readiness_timeout:
description: Number of seconds to wait for a probe response from web pod
type: integer
default: 1
format: int32
task_readiness_timeout:
description: Number of seconds to wait for a probe response from task pod
type: integer
default: 1
format: int32
garbage_collect_secrets:
description: Whether or not to remove secrets upon instance removal
default: false
Expand Down
11 changes: 11 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,17 @@ rules:
- patch
- update
- watch
- apiGroups:
- batch
resources:
- jobs
verbs:
- get
- list
- create
- patch
- update
- watch
- apiGroups:
- networking.k8s.io
resources:
Expand Down
40 changes: 40 additions & 0 deletions docs/user-guide/advanced-configuration/container-probes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#### Container Probes
These parameters control the usage of liveness and readiness container probes for
the web and task containers.

#### Web / Task Container Liveness Check

The liveness probe queries the status of the supervisor daemon of the container. The probe will fail if it
detects one of the services in a state other than "RUNNING".

| Name | Description | Default |
| web_liveness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 |
| web_liveness_initial_delay | Initial delay before starting probes in seconds | 5 |
| web_liveness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 |
| web_liveness_timeout | Number of seconds to wait for a probe response from container | 1 |
| task_liveness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 |
| task_liveness_initial_delay | Initial delay before starting probes in seconds | 5 |
| task_liveness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 |
| task_liveness_timeout | Number of seconds to wait for a probe response from container | 1 |

#### Web Container Readiness Check

This is a HTTP check against the status endpoint to confirm the system is still able to respond to web requests.

| Name | Description | Default |
| -------------| ---------------------------------- | ------- |
| web_readiness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 |
| web_readiness_initial_delay | Initial delay before starting probes in seconds | 5 |
| web_readiness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 |
| web_readiness_timeout | Number of seconds to wait for a probe response from container | 1 |

#### Task Container Readiness Check

This is a command probe using the builtin check command of the awx-manage utility.

| Name | Description | Default |
| -------------| ---------------------------------- | ------- |
| task_readiness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 |
| task_readiness_initial_delay | Initial delay before starting probes in seconds | 5 |
| task_readiness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 |
| task_readiness_timeout | Number of seconds to wait for a probe response from container | 1 |
44 changes: 1 addition & 43 deletions roles/installer/tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,51 +91,9 @@
ignore_errors: yes
changed_when: false

- name: Include resources configuration tasks
- name: Include resources configuration and database schema migration tasks
include_tasks: resources_configuration.yml

- name: Check for pending migrations
k8s_exec:
namespace: "{{ ansible_operator_meta.namespace }}"
pod: "{{ awx_task_pod_name }}"
container: "{{ ansible_operator_meta.name }}-task"
command: >-
bash -c "awx-manage showmigrations | grep -v '[X]' | grep '[ ]' | wc -l"
changed_when: false
when: awx_task_pod_name != ''
register: database_check

- name: Migrate the database if the K8s resources were updated # noqa 305
k8s_exec:
namespace: "{{ ansible_operator_meta.namespace }}"
pod: "{{ awx_task_pod_name }}"
container: "{{ ansible_operator_meta.name }}-task"
command: |
bash -c "
function end_keepalive {
rc=$?
rm -f \"$1\"
kill $(cat /proc/$2/task/$2/children 2>/dev/null) 2>/dev/null || true
wait $2 || true
exit $rc
}
keepalive_file=\"$(mktemp)\"
while [[ -f \"$keepalive_file\" ]]; do
echo 'Database schema migration in progress...'
sleep 60
done &
keepalive_pid=$!
trap 'end_keepalive \"$keepalive_file\" \"$keepalive_pid\"' EXIT SIGINT SIGTERM
echo keepalive_pid: $keepalive_pid
awx-manage migrate --noinput
echo 'Successful'
"
register: migrate_result
when:
- awx_task_pod_name != ''
- database_check is defined
- (database_check.stdout|trim) != '0'

- name: Initialize Django
include_tasks: initialize_django.yml
when: awx_task_pod_name != ''
Expand Down
95 changes: 87 additions & 8 deletions roles/installer/tasks/resources_configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@
- status.phase=Running
register: awx_task_pod

- name: Set the resource pod as a variable.
- name: Set the resource task pod as a variable.
set_fact:
awx_task_pod: >-
{{ awx_task_pod['resources']
| rejectattr('metadata.deletionTimestamp', 'defined')
| sort(attribute='metadata.creationTimestamp')
| first | default({}) }}
- name: Set the resource pod name as a variable.
- name: Set the resource task pod name as a variable.
set_fact:
awx_task_pod_name: "{{ awx_task_pod['metadata']['name'] | default('') }}"

Expand Down Expand Up @@ -249,15 +249,89 @@
k8s:
apply: yes
definition: "{{ lookup('template', 'deployments/{{ item }}.yaml.j2') }}"
wait: yes
wait_timeout: "{{ (120 * replicas) or 120 }}"
loop:
- task
- web
- task
register: this_deployment_result

- block:
- name: Get the new resource pod information after updating resource.
- name: Get the new web pod information after updating resource.
k8s_info:
kind: Pod
namespace: '{{ ansible_operator_meta.namespace }}'
label_selectors:
- "app.kubernetes.io/name={{ ansible_operator_meta.name }}-web"
- "app.kubernetes.io/managed-by={{ deployment_type }}-operator"
- "app.kubernetes.io/component={{ deployment_type }}"
field_selectors:
- status.phase=Running
register: _new_pod

- name: Update new web pod as a variable.
set_fact:
awx_web_pod: >-
{{ _new_pod['resources']
| rejectattr('metadata.deletionTimestamp', 'defined')
| sort(attribute='metadata.creationTimestamp')
| last | default({}) }}
- name: Update new web pod name as a variable.
set_fact:
awx_web_pod_name: '{{ awx_web_pod["metadata"]["name"] | default("")}}'

# We use the web pod to check for migrations because it should be in a running
# state. The task pod will be waiting in an init container waiting for
# the migrations to finish.
- name: Check for pending migrations
k8s_exec:
namespace: "{{ ansible_operator_meta.namespace }}"
pod: "{{ awx_web_pod_name }}"
container: "{{ ansible_operator_meta.name }}-web"
command: >-
bash -c "awx-manage showmigrations | grep -v '[X]' | grep '[ ]' | wc -l"
changed_when: false
when: awx_web_pod_name != ''
register: database_check

- name: Get version of controller for tracking migrations
k8s_exec:
namespace: "{{ ansible_operator_meta.namespace }}"
pod: "{{ awx_web_pod_name }}"
container: "{{ ansible_operator_meta.name }}-web"
command: >-
bash -c "awx-manage --version"
changed_when: false
register: version_check
when:
- database_check is defined

Check warning on line 306 in roles/installer/tasks/resources_configuration.yml

View workflow job for this annotation

GitHub Actions / molecule (--skip-tags=replicas)

306:7 [indentation] wrong indentation: expected 8 but found 6

Check warning on line 306 in roles/installer/tasks/resources_configuration.yml

View workflow job for this annotation

GitHub Actions / molecule (-t replicas)

306:7 [indentation] wrong indentation: expected 8 but found 6
- (database_check.stdout|trim) != '0'

- name: Update instance version
set_fact:
version: "{{ version_check.stdout | trim }}"

- name: Create migration job
k8s:
apply: yes
definition: "{{ lookup('template', 'job/migration.yaml.j2') }}"
register: migrate_result
when:
- database_check is defined

Check warning on line 319 in roles/installer/tasks/resources_configuration.yml

View workflow job for this annotation

GitHub Actions / molecule (--skip-tags=replicas)

319:7 [indentation] wrong indentation: expected 8 but found 6

Check warning on line 319 in roles/installer/tasks/resources_configuration.yml

View workflow job for this annotation

GitHub Actions / molecule (-t replicas)

319:7 [indentation] wrong indentation: expected 8 but found 6
- (database_check.stdout|trim) != '0'

- name: Watch for the migration job to finish
k8s_info:
kind: Job
namespace: "{{ ansible_operator_meta.namespace }}"
name: "{{ ansible_operator_meta.name }}-migration-{{ version }}"
register: result
until:
- result.resources[0].status.succeeded is defined
- result.resources[0].status.succeeded == 1
retries: 180
delay: 10

- name: Get the new task pod information after updating resource.
k8s_info:
kind: Pod
namespace: '{{ ansible_operator_meta.namespace }}'
Expand All @@ -267,19 +341,24 @@
- "app.kubernetes.io/component={{ deployment_type }}"
field_selectors:
- status.phase=Running
wait: true
wait_timeout: 30
wait_condition:
type: Ready
register: _new_pod

- name: Update new resource pod as a variable.
- name: Update new task pod as a variable.
set_fact:
awx_task_pod: >-
{{ _new_pod['resources']
| rejectattr('metadata.deletionTimestamp', 'defined')
| sort(attribute='metadata.creationTimestamp')
| last | default({}) }}
- name: Update new resource pod name as a variable.
- name: Update new task pod name as a variable.
set_fact:
awx_task_pod_name: '{{ awx_task_pod["metadata"]["name"] | default("")}}'

when:
- this_deployment_result.changed

Expand Down
47 changes: 46 additions & 1 deletion roles/installer/templates/deployments/task.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,28 @@ spec:
priorityClassName: '{{ control_plane_priority_class }}'
{% endif %}
initContainers:
- name: init
- name: init-database
image: '{{ _image }}'
imagePullPolicy: '{{ image_pull_policy }}'
resources: {{ init_container_resource_requirements }}
command:
- /bin/sh
- -c
- wait-for-migrations
volumeMounts:
- name: {{ ansible_operator_meta.name }}-application-credentials
mountPath: "/etc/tower/conf.d/credentials.py"
subPath: credentials.py
readOnly: true
- name: "{{ secret_key_secret_name }}"
mountPath: /etc/tower/SECRET_KEY
subPath: SECRET_KEY
readOnly: true
- name: {{ ansible_operator_meta.name }}-settings
mountPath: "/etc/tower/settings.py"
subPath: settings.py
readOnly: true
- name: init-receptor
image: '{{ _init_container_image }}'
imagePullPolicy: '{{ image_pull_policy }}'
resources: {{ init_container_resource_requirements }}
Expand Down Expand Up @@ -188,6 +209,30 @@ spec:
{% endif %}
{% if task_args %}
args: {{ task_args }}
{% endif %}
{% if task_liveness_period|int > 0 %}
livenessProbe:
exec:
command:
- sh
- -c
- |
(exit $(/usr/bin/supervisorctl -c /etc/supervisord_task.conf status | grep -vc RUNNING))
initialDelaySeconds: {{ task_liveness_initial_delay }}
periodSeconds: {{ task_liveness_period }}
failureThreshold: {{ task_liveness_failure_threshold }}
timeoutSeconds: {{ task_liveness_timeout }}
{% endif %}
{% if task_readiness_period|int > 0 %}
readinessProbe:
exec:
command:
- /usr/bin/awx-manage
- check
initialDelaySeconds: {{ task_readiness_initial_delay }}
periodSeconds: {{ task_readiness_period }}
failureThreshold: {{ task_readiness_failure_threshold }}
timeoutSeconds: {{ task_readiness_timeout }}
{% endif %}
volumeMounts:
{% if bundle_ca_crt %}
Expand Down
Loading

0 comments on commit cd5a2a9

Please sign in to comment.