From d494c9d8b62c6a7a6f62d7400274514f22698531 Mon Sep 17 00:00:00 2001 From: Julien Girardin Date: Fri, 15 Sep 2023 14:35:12 +0200 Subject: [PATCH] Check health on control-plane before acting on cluster --- docs/variables.md | 1 + roles/preflight_check_cp/defaults/main.yml | 1 + .../tasks/check_control_plane_health.yml | 24 +++++++++++++++++++ roles/preflight_check_cp/tasks/main.yml | 5 ++++ 4 files changed, 31 insertions(+) create mode 100644 roles/preflight_check_cp/tasks/check_control_plane_health.yml diff --git a/docs/variables.md b/docs/variables.md index 7139467..b7a0c85 100644 --- a/docs/variables.md +++ b/docs/variables.md @@ -12,6 +12,7 @@ For hooks where a variable-per-hook is exposed, see [hooks && plugins](hooks_and | apiserver_manifest | control plane | "/etc/kubernetes/manifests/kube-apiserver.yaml" | filename to stat for presence in the process to discover already running control-plane | | cluster_config | control plane | {} | config to be used by kubeadm for the `kind: CluserConfiguration` | | control_plane_endpoint | control plane | "" (let kubeadm default) | control the "controlPlaneEndpoint" entry of the cluster_config. Could also be set as part of the cluster_config. 
Default to nothing but ansible-kubeadm will fail if not set in case of multi-control-plane nodes cluster | +| cp_health_check_bypass | control plane | false | Bypass check on control-plane health | | enable_kubeadm_patches | control plane | true | Deploy patches and pass `kubeadm_patch_dir` to kubeadm so that patch are applied | | kube_control_plane_cidr | control plane | "" (let kubeadm default) | CIDR (eg "192.168.99.0/24") filter addresses for `_etcd_metrics_bind_address`, `_kube_apiserver_advertise_address`, `_kube_controller_manager_bind_address`, `_kube_scheduler_bind_address` | | kube_apiserver_advertise_cidr | control plane | "" (let kubeadm default) | CIDR (eg "192.168.99.0/24") filter the advertise address to `_kube_apiserver_advertise_address` (override `kube_control_plane_cidr`) | diff --git a/roles/preflight_check_cp/defaults/main.yml b/roles/preflight_check_cp/defaults/main.yml index 628cfca..7374a3c 100644 --- a/roles/preflight_check_cp/defaults/main.yml +++ b/roles/preflight_check_cp/defaults/main.yml @@ -2,6 +2,7 @@ _config_upgrade_reasons: {} _failure_reasons: {} _upgrade_reasons: {} +cp_health_check_bypass: false kube_version: default_kube_version: '1.19' diff --git a/roles/preflight_check_cp/tasks/check_control_plane_health.yml b/roles/preflight_check_cp/tasks/check_control_plane_health.yml new file mode 100644 index 0000000..9401d67 --- /dev/null +++ b/roles/preflight_check_cp/tasks/check_control_plane_health.yml @@ -0,0 +1,24 @@ +--- +- name: 'get nodes if cluster running' + command: kubectl get nodes -o yaml + changed_when: false + check_mode: false + register: _all_nodes_yaml + environment: + KUBECONFIG: '{{ kubeconfig_admin }}' + +- name: 'Check control-plane health' + set_fact: + _failure_reasons: >- + {%- set all_nodes = (_all_nodes_yaml.stdout|from_yaml)['items'] |selectattr("metadata.labels.node-role\.kubernetes\.io/control-plane", "defined") -%} +{%- if all_nodes|map(attribute="status.conditions") +|map("selectattr", 
"type", "eq", "Ready") +|map("first") +|rejectattr("status", "eq", "True") +|list|length != 0 -%} +{%- set _ = _failure_reasons.update(dict( + cp_health = "Some control-plane nodes are not healthy")) -%} +{%- endif -%} +{{ _failure_reasons }} +when: not cp_health_check_bypass|bool diff --git a/roles/preflight_check_cp/tasks/main.yml b/roles/preflight_check_cp/tasks/main.yml index d6aae95..dcbdb03 100644 --- a/roles/preflight_check_cp/tasks/main.yml +++ b/roles/preflight_check_cp/tasks/main.yml @@ -32,3 +32,8 @@ - import_tasks: check_version.yml - import_tasks: check_control_plane_endpoint.yml + +- import_tasks: check_control_plane_health.yml + when: + - groups.cp_running|default([])|length > 0 + - not cp_health_check_bypass|bool