Kubernetes HA upgrades (#1456)
* epicli/upgrade: reusing existing shared-config + cleanups

* upgrade: k8s HA upgrades minimal implementation

* upgrade: kubernetes cleanup and refactor

* Apply suggestions from code review

Co-authored-by: to-bar <[email protected]>

* upgrade: removing unneeded kubeconfig from k8s nodes (security fix)

* upgrade: statefulset patching refactor

* upgrade: cleanups and refactor for logs

* Make deployment manifest tasks more generic

* Improve detecting CNI plugin

* AnsibleVarsGenerator.py: fixing regression issue introduced during upgrade refactor

* Apply suggestions from code review

Co-authored-by: to-bar <[email protected]>

* upgrade: statefulset patching refactor

- patching all containers (fix)
- also patching init containers (fix)
- removing include_tasks statements (speedup)

(see the patch sketch after this commit message)

* Ensure settings for backward compatibility

* Revert "Ensure settings for backward compatibility"

This reverts commit 5c9cdb6.

* AnsibleInventoryUpgrade.py: merging shared-config with defaults

* Adding changelog entry

* Revert "AnsibleVarsGenerator.py: fixing regression issue introducted during upgrade refactor"

This reverts commit c38eb9d.

* Revert "epicli/upgrade: reusing existing shared-config + cleanups"

This reverts commit e5957c5.

* AnsibleVarsGenerator.py: adding nicer way to handle shared config

Co-authored-by: to-bar <[email protected]>
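
The "statefulset patching refactor" bullets above describe a fix whose tasks are not among the diffs rendered below. As a rough illustration only, using the Python Kubernetes client rather than the project's Ansible tasks (the resource names and the image-rewrite rule are made up), the essential point is that the patch must walk init containers as well as regular containers:

from kubernetes import client, config

def new_image_for(image: str) -> str:
    # Made-up rewrite rule, for illustration: point images at a local registry.
    return "localhost:5000/" + image.split("/")[-1]

def patch_statefulset(name: str, namespace: str) -> None:
    config.load_kube_config()  # or load_incluster_config() inside the cluster
    apps = client.AppsV1Api()
    sts = apps.read_namespaced_stateful_set(name, namespace)
    pod_spec = sts.spec.template.spec
    # The fix described above: walk init containers as well as regular ones.
    for container in (pod_spec.containers or []) + (pod_spec.init_containers or []):
        container.image = new_image_for(container.image)
    apps.patch_namespaced_stateful_set(name, namespace, sts)

patch_statefulset("my-app", "default")  # hypothetical resource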
sk4zuzu and to-bar authored Jul 23, 2020
1 parent b19afc8 commit 6da1518
Showing 52 changed files with 739 additions and 645 deletions.
1 change: 1 addition & 0 deletions CHANGELOG-0.7.md
@@ -13,6 +13,7 @@
- [#1399](https://github.com/epiphany-platform/epiphany/issues/1399) - Epicli upgrade: Kubernetes upgrade may hang
- [#1398](https://github.com/epiphany-platform/epiphany/issues/1398) - Vault installation fails when using canal/calico network plugin
- [#1412](https://github.com/epiphany-platform/epiphany/issues/1412) - Certificate in Vault is generated or copied even if the tls_disable configuration flag is set to true
- [#1408](https://github.com/epiphany-platform/epiphany/issues/1408) - Epiphany does not support upgrades for Kubernetes in HA mode

### Added

@@ -108,4 +108,4 @@ def upgrade(self):
# save new inventory
save_inventory(new_inventory, self.cluster_model, self.build_dir)

return 0
return 0
42 changes: 35 additions & 7 deletions core/src/epicli/cli/engine/ansible/AnsibleVarsGenerator.py
@@ -2,13 +2,15 @@
import copy

from cli.helpers.Step import Step
from cli.helpers.build_saver import get_ansible_path, get_ansible_path_for_build, get_ansible_vault_path
from cli.helpers.doc_list_helpers import select_first
from cli.helpers.build_saver import get_ansible_path, get_ansible_path_for_build, get_ansible_vault_path, MANIFEST_FILE_NAME
from cli.helpers.doc_list_helpers import select_first, select_single
from cli.helpers.naming_helpers import to_feature_name, to_role_name
from cli.helpers.ObjDict import ObjDict
from cli.helpers.yaml_helpers import dump
from cli.helpers.Config import Config
from cli.helpers.data_loader import load_yaml_obj, types, load_all_documents_from_folder
from cli.helpers.data_loader import load_yaml_obj, types, load_yamls_file, load_all_documents_from_folder

from cli.engine.schema.DefaultMerger import DefaultMerger


class AnsibleVarsGenerator(Step):
@@ -93,10 +95,14 @@ def populate_group_vars(self, ansible_dir):
main_vars['is_upgrade_run'] = self.is_upgrade_run
main_vars['roles_with_generated_vars'] = sorted(self.roles_with_generated_vars)

shared_config_doc = select_first(self.config_docs, lambda x: x.kind == 'configuration/shared-config')
if shared_config_doc == None:
if self.is_upgrade_run:
shared_config_doc = self.get_shared_config_from_manifest()
else:
shared_config_doc = select_first(self.config_docs, lambda x: x.kind == 'configuration/shared-config')

if shared_config_doc is None:
shared_config_doc = load_yaml_obj(types.DEFAULT, 'common', 'configuration/shared-config')

self.set_vault_path(shared_config_doc)
main_vars.update(shared_config_doc.specification)

@@ -115,7 +121,7 @@ def set_vault_path(self, shared_config):
shared_config.specification.vault_tmp_file_location = Config().vault_password_location
cluster_name = self.get_cluster_name()
shared_config.specification.vault_location = get_ansible_vault_path(cluster_name)

def get_cluster_name(self):
if 'name' in self.cluster_model.specification.keys():
return self.cluster_model.specification.name
@@ -128,6 +134,28 @@ def get_clean_cluster_model(self):
self.clear_object(cluster_model, 'credentials')
return cluster_model

def get_shared_config_from_manifest(self):
# Reuse shared config from existing manifest
# Shared config contains the use_ha_control_plane flag which is required during upgrades

path_to_manifest = os.path.join(self.inventory_upgrade.build_dir, MANIFEST_FILE_NAME)
if not os.path.isfile(path_to_manifest):
raise Exception('No manifest.yml inside the build folder')

manifest_docs = load_yamls_file(path_to_manifest)

cluster_model = select_single(manifest_docs, lambda x: x.kind == 'epiphany-cluster')

shared_config_doc = select_single(manifest_docs, lambda x: x.kind == 'configuration/shared-config')
shared_config_doc['provider'] = cluster_model['provider']

# Merge the shared config doc with defaults
with DefaultMerger([shared_config_doc]) as doc_merger:
shared_config_doc = doc_merger.run()[0]
del shared_config_doc['provider']

return shared_config_doc

def clear_object(self, obj_to_clean, key_to_clean):
for key, val in obj_to_clean.items():
if key == key_to_clean:
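
For reference, here is a minimal standalone sketch of the pattern added above: read the multi-document manifest.yml from a previous build and select the configuration/shared-config document. Plain PyYAML stands in for the project's load_yamls_file and select_single helpers, and the path is illustrative:

import yaml

def shared_config_from_manifest(path="build/manifest.yml"):
    # Load every YAML document from the multi-document manifest file.
    with open(path) as f:
        docs = [d for d in yaml.safe_load_all(f) if d is not None]
    # select_single semantics: exactly one matching document is expected.
    matches = [d for d in docs
               if isinstance(d, dict) and d.get("kind") == "configuration/shared-config"]
    if len(matches) != 1:
        raise ValueError(f"expected exactly one shared-config document, got {len(matches)}")
    return matches[0]

The real method also copies the cluster's provider into the document so that DefaultMerger can resolve provider-specific defaults, then removes it again, as the diff above shows.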
@@ -16,3 +16,5 @@ kubelet_custom_config:
systemReserved:
cpu: 50m
memory: 768Mi # based on RedHat 7.5 on Standard_DS1_v2 Azure VM with =~ 30 pods

epiphany_manifests_dir: /etc/epiphany/manifests
@@ -5,9 +5,8 @@
include_tasks: deployments/deploy-template.yml

- name: Check if kubernetes-dashboard is already deployed
become_user: "{{ admin_user.name }}"
environment:
KUBECONFIG: "/home/{{ admin_user.name }}/.kube/config"
KUBECONFIG: /etc/kubernetes/admin.conf
shell: |
kubectl get pods \
--namespace kubernetes-dashboard \
@@ -1,25 +1,7 @@
---
- name: Apply network plugin configured by user
include_tasks: "./cni-plugins/{{ network_plugin }}.yml"
include_tasks: cni-plugins/{{ network_plugin }}.yml

# Wait for CNI plugin become ready to prevent failure of 'Get token from master' task on node before joining master
- name: Wait for CNI plugin become ready
shell: >-
kubectl wait --for=condition=Ready pods -l {{ selectors[network_plugin] }}
--field-selector=spec.nodeName=$(hostname --long) -n kube-system --timeout=10s
args:
executable: /bin/bash
environment:
KUBECONFIG: /home/{{ admin_user.name }}/.kube/config
register: wait_for_cni_plugin
until: wait_for_cni_plugin is succeeded
retries: 30
delay: 1
changed_when: false
vars:
selectors:
calico: k8s-app=calico-node
canal: k8s-app=canal
flannel: app=flannel
when:
- network_plugin in selectors.keys()
# Wait for CNI plugin to become ready to prevent failure of 'Get token from master' task on node before joining master
- name: Include wait-for-cni-plugin.yml
include_tasks: cni-plugins/wait-for-cni-plugin.yml
@@ -47,5 +47,5 @@
- name: Apply calico definition
environment:
KUBECONFIG: "/home/{{ admin_user.name }}/.kube/config"
command: kubectl apply -f /home/{{ admin_user.name }}/calico.yml
KUBECONFIG: /etc/kubernetes/admin.conf
command: kubectl apply -f /home/{{ admin_user.name }}/calico.yml
@@ -15,5 +15,5 @@
- name: Apply canal deployment
environment:
KUBECONFIG: "/home/{{ admin_user.name }}/.kube/config"
command: kubectl apply -f /home/{{ admin_user.name }}/canal.yml
KUBECONFIG: /etc/kubernetes/admin.conf
command: kubectl apply -f /home/{{ admin_user.name }}/canal.yml
@@ -8,5 +8,5 @@

- name: Apply flannel definition
environment:
KUBECONFIG: "/home/{{ admin_user.name }}/.kube/config"
command: kubectl apply -f /home/{{ admin_user.name }}/kube-flannel.yml
KUBECONFIG: /etc/kubernetes/admin.conf
command: kubectl apply -f /home/{{ admin_user.name }}/kube-flannel.yml
@@ -0,0 +1,21 @@
---
# This file is meant to be also used by upgrade role

- name: Wait for CNI plugin to become ready
environment:
KUBECONFIG: /etc/kubernetes/admin.conf
shell: >-
kubectl wait --for=condition=Ready pods -l {{ selectors[network_plugin] }}
--field-selector=spec.nodeName=$(hostname --long) -n kube-system --timeout=10s
args:
executable: /bin/bash
register: wait_for_cni_plugin
until: wait_for_cni_plugin is succeeded
retries: 30
delay: 1
changed_when: false
vars:
selectors:
calico: k8s-app=calico-node
canal: k8s-app=canal
flannel: app=flannel
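
A rough Python equivalent of the retry loop in this new task, for illustration only; it shells out to kubectl wait with the same per-plugin label selectors, and socket.getfqdn() stands in for hostname --long:

import os
import socket
import subprocess
import time

SELECTORS = {
    "calico": "k8s-app=calico-node",
    "canal": "k8s-app=canal",
    "flannel": "app=flannel",
}

def wait_for_cni(plugin: str, retries: int = 30, delay: int = 1) -> None:
    cmd = [
        "kubectl", "wait", "--for=condition=Ready", "pods",
        "-l", SELECTORS[plugin],
        "--field-selector=spec.nodeName=" + socket.getfqdn(),
        "-n", "kube-system", "--timeout=10s",
    ]
    env = {**os.environ, "KUBECONFIG": "/etc/kubernetes/admin.conf"}
    for _ in range(retries):
        if subprocess.run(cmd, env=env).returncode == 0:
            return  # all matching pods on this node are Ready
        time.sleep(delay)
    raise TimeoutError(f"{plugin} pods did not become Ready on this node")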
@@ -1,9 +1,6 @@
---
- name: "Apply /etc/epiphany/manifests/{{ file_name }} file"
- name: Apply {{ file_path }} file
environment:
KUBECONFIG: "/home/{{ admin_user.name }}/.kube/config"
shell: |
kubectl apply \
-f /etc/epiphany/manifests/{{ file_name }}
args:
executable: /bin/bash
KUBECONFIG: /etc/kubernetes/admin.conf
shell: >-
kubectl apply -f {{ file_path }}
@@ -2,20 +2,22 @@
- name: Create directory for files
become: true
file:
path: /etc/epiphany/manifests
path: "{{ epiphany_manifests_dir }}"
state: directory
owner: root
group: root
mode: u=rw,go=r
mode: u=rwx,go=r

- name: "Copy {{ file_name }}"
- name: Upload {{ file_name }} file
become: true
copy:
src: "{{ file_name }}"
dest: "/etc/epiphany/manifests/{{ file_name }}"
dest: "{{ epiphany_manifests_dir }}/{{ file_name }}"
owner: "{{ admin_user.name }}"
group: "{{ admin_user.name }}"
mode: u=rw,go=r

- name: Apply file
include_tasks: apply-file.yml
vars:
file_path: "{{ epiphany_manifests_dir }}/{{ file_name }}"
@@ -2,20 +2,22 @@
- name: Create directory for files
become: true
file:
path: /etc/epiphany/manifests
path: "{{ epiphany_manifests_dir }}"
state: directory
owner: root
group: root
mode: u=rwx,go=r

- name: "Upload {{ file_name }} file"
- name: Upload {{ file_name }} file
become: true
template:
src: "{{ file_name }}"
dest: "/etc/epiphany/manifests/{{ file_name }}"
dest: "{{ epiphany_manifests_dir }}/{{ file_name | regex_replace('.j2$') }}"
owner: "{{ admin_user.name }}"
group: "{{ admin_user.name }}"
mode: u=rw,go=r

- name: Apply file
include_tasks: apply-file.yml
vars:
file_path: "{{ epiphany_manifests_dir }}/{{ file_name | regex_replace('.j2$') }}"
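
Note that the destination strips a trailing .j2 so templated manifests land under their final names. In Python terms the filter call behaves roughly like this (the file name is made up):

import re

# roughly equivalent to: "{{ file_name | regex_replace('.j2$') }}"
print(re.sub(r"\.j2$", "", "coredns-config.yml.j2"))  # -> coredns-config.yml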

This file was deleted.

@@ -8,32 +8,32 @@
- name: Reconfigure Docker for pulling images from local registry
block:
- name: image-registry | Drain node in preparation for Docker reconfiguration
include_tasks: kubernetes/node/drain.yml
include_tasks: kubernetes/utils/drain.yml
when:
- groups['kubernetes_node'] is defined
- inventory_hostname in groups['kubernetes_node']
- groups.kubernetes_node is defined
- inventory_hostname in groups.kubernetes_node

- name: image-registry | Wait for cluster's readiness
include_tasks: kubernetes/wait.yml
include_tasks: kubernetes/utils/wait.yml
when:
- groups['kubernetes_node'] is defined
- inventory_hostname in groups['kubernetes_node']
- groups.kubernetes_node is defined
- inventory_hostname in groups.kubernetes_node

- name: image-registry | Reconfigure Docker if necessary # this restarts Docker daemon
include_role:
name: docker
tasks_from: configure-docker

- name: Include wait-for-kube-apiserver.yml
include_tasks: kubernetes/wait-for-kube-apiserver.yml
include_tasks: kubernetes/utils/wait-for-kube-apiserver.yml
when:
- inventory_hostname in groups['kubernetes_master']
- inventory_hostname in groups.kubernetes_master

- name: image-registry | Uncordon node - mark node as schedulable
include_tasks: kubernetes/node/uncordon.yml
include_tasks: kubernetes/utils/uncordon.yml
when:
- groups['kubernetes_node'] is defined
- inventory_hostname in groups['kubernetes_node']
- groups.kubernetes_node is defined
- inventory_hostname in groups.kubernetes_node

when:
- not image_registry_address in result.stdout
- not image_registry_address in result.stdout
@@ -1,37 +1,48 @@
---
- name: Include wait-for-kube-apiserver.yml
import_tasks: kubernetes/wait-for-kube-apiserver.yml
delegate_to: "{{ groups['kubernetes_master'][0] }}"
- name: k8s | Wait for kube-apiserver then get cluster and kubelet version
delegate_to: "{{ groups.kubernetes_master[0] }}"
block:
- name: k8s | Include wait-for-kube-apiserver.yml
import_tasks: kubernetes/utils/wait-for-kube-apiserver.yml

- name: Include get-cluster-version.yml
import_tasks: kubernetes/get-cluster-version.yml # sets cluster_version
delegate_to: "{{ groups['kubernetes_master'][0] }}"
- name: k8s | Include get-cluster-version.yml
import_tasks: kubernetes/get-cluster-version.yml # sets cluster_version

- name: Check if upgrade from current K8s version is supported
- name: k8s | Check if upgrade from current K8s version is supported
assert:
that: cluster_version is version('v1.14.6', '>=')
fail_msg: Your Kubernetes version ({{ cluster_version }}) is not supported by this version of Epiphany which requires at least version 1.14.6 (Epiphany v0.4.4). For more information, refer to the documentation.
quiet: true

- name: Include get-kubelet-version.yml
- name: k8s | Include get-kubelet-version.yml
import_tasks: kubernetes/get-kubelet-version.yml # sets kubelet_version
delegate_to: "{{ groups['kubernetes_master'][0] }}"

- name: Upgrade master to v{{ version }}
include_tasks: kubernetes/upgrade-master.yml
- name: k8s | Upgrade masters then nodes
vars:
version: "{{ ver }}"
cni_version: "{{ cni_ver }}"
when:
- groups['kubernetes_master'][0] == inventory_hostname
- cluster_version is version('v' + version, '<=')
block:
- name: k8s | Upgrade masters
when: cluster_version is version('v' + version, '<=')
block:
- name: k8s | Upgrade first master to v{{ version }}
include_tasks: kubernetes/upgrade-master0.yml
when:
- inventory_hostname == groups.kubernetes_master[0]

- name: Upgrade node to v{{ version }}
include_tasks: kubernetes/upgrade-node.yml
vars:
version: "{{ ver }}"
cni_version: "{{ cni_ver }}"
when:
- groups['kubernetes_node'] is defined
- inventory_hostname in groups['kubernetes_node']
- kubelet_version is version('v' + version, '<=')
- name: k8s | Upgrade next master to v{{ version }}
include_tasks: kubernetes/upgrade-masterN.yml
when:
- inventory_hostname in groups.kubernetes_master[1:]

- name: k8s | Upgrade nodes
when: kubelet_version is version('v' + version, '<=')
block:
- name: k8s | Upgrade node to v{{ version }}
include_tasks: kubernetes/upgrade-node.yml
when:
- groups.kubernetes_node is defined
- inventory_hostname in groups.kubernetes_node

# TODO: Create a flag file that the upgrade completed to not run it again for the same version next time
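
One possible shape for that TODO, purely a sketch with a hypothetical marker-file location rather than code from this PR: record a per-version flag after a successful upgrade and skip the play when it already exists.

from pathlib import Path

FLAG_DIR = Path("/var/lib/epiphany/upgrade")  # hypothetical location

def already_upgraded(version: str) -> bool:
    # Skip the upgrade play when this version's marker already exists.
    return (FLAG_DIR / f"kubernetes-v{version}.done").exists()

def mark_upgraded(version: str) -> None:
    # Record success so the same version is not upgraded again.
    FLAG_DIR.mkdir(parents=True, exist_ok=True)
    (FLAG_DIR / f"kubernetes-v{version}.done").touch()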