diff --git a/.flake8 b/.flake8
new file mode 100644
index 000000000..812954650
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+exclude=.git,__pycache__
+max-line-length=120
diff --git a/.gitignore b/.gitignore
index e09b6644b..af02ef71f
--- a/.gitignore
+++ b/.gitignore
@@ -34,3 +34,59 @@ scm-source.json
 # diagrams
 *.aux
 *.log
+
+# Python
+# Adapted from https://github.com/github/gitignore/blob/master/Python.gitignore
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
diff --git a/.travis.yml b/.travis.yml
index 0fd48a9ca..7e77ffb41 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,8 +15,9 @@ before_install:
   - go get github.com/mattn/goveralls
 
 install:
-  - make deps
+  - make deps e2e-tools e2e-build
 
 script:
   - hack/verify-codegen.sh
   - travis_wait 20 goveralls -service=travis-ci -package ./pkg/... -v
+  - make e2e-run
diff --git a/Makefile b/Makefile
index 5b27281c2..5d97c817a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: clean local test linux macos docker push scm-source.json
+.PHONY: clean local test linux macos docker push scm-source.json e2e-run e2e-tools e2e-build
 
 BINARY ?= postgres-operator
 BUILD_FLAGS ?= -v
@@ -34,7 +34,7 @@ ifdef CDP_PULL_REQUEST_NUMBER
 	CDP_TAG := -${CDP_BUILD_VERSION}
 endif
 
-
+KIND_PATH := $(GOPATH)/bin
 PATH := $(GOPATH)/bin:$(PATH)
 SHELL := env PATH=$(PATH) $(SHELL)
 
@@ -91,3 +91,16 @@ deps:
 test:
 	hack/verify-codegen.sh
 	@go test ./...
+
+e2e-build:
+	docker build --tag="postgres-operator-e2e-tests" -f e2e/Dockerfile .
+
+e2e-tools:
+	# install pinned version of 'kind'
+	# leave the name as is to avoid overwriting official binary named `kind`
+	wget https://github.com/kubernetes-sigs/kind/releases/download/v0.3.0/kind-linux-amd64
+	chmod +x kind-linux-amd64
+	mv kind-linux-amd64 $(KIND_PATH)
+
+e2e-run: docker
+	e2e/run.sh
diff --git a/delivery.yaml b/delivery.yaml
index 1866486f8..a60a656b1 100644
--- a/delivery.yaml
+++ b/delivery.yaml
@@ -11,7 +11,7 @@ pipeline:
           apt-get update
       - desc: 'Install required build software'
        cmd: |
-          apt-get install -y make git apt-transport-https ca-certificates curl build-essential
+          apt-get install -y make git apt-transport-https ca-certificates curl build-essential python3 python3-pip
       - desc: 'Install go'
         cmd: |
           cd /tmp
@@ -41,6 +41,10 @@ pipeline:
           export PATH=$PATH:$HOME/go/bin
           cd $OPERATOR_TOP_DIR/postgres-operator
           go test ./...
+      - desc: 'Run e2e tests'
+        cmd: |
+          cd $OPERATOR_TOP_DIR/postgres-operator
+          make e2e-tools e2e-build e2e-run
       - desc: 'Push docker image'
         cmd: |
           export PATH=$PATH:$HOME/go/bin
diff --git a/docs/developer.md b/docs/developer.md
index e181357ab..a7c7a2f33 100644
--- a/docs/developer.md
+++ b/docs/developer.md
@@ -315,6 +315,16 @@ Then you can for example check the Patroni logs:
 kubectl logs acid-minimal-cluster-0
 ```
 
+## End-to-end tests
+
+The operator provides reference e2e (end-to-end) tests to ensure that the operator and the Kubernetes resources it manages work smoothly together.
+Each e2e run tests a Postgres operator image built from the current git branch. The test runner starts a [kind](https://kind.sigs.k8s.io/) (local Kubernetes) cluster and a Docker container with the tests. The Kubernetes API client inside the container connects to the `kind` cluster over the standard Docker `bridge` network.
+The tests use the examples from `/manifests` (a ConfigMap is used for the operator configuration) to avoid maintaining yet another set of configuration files. The `kind` cluster is deleted if the tests complete successfully.
+
+End-to-end tests are executed automatically during builds; to invoke them locally, run `make e2e-run` from the project's top directory. Before the first run, execute `make e2e-tools e2e-build` to install `kind` and build the tests' image.
+
+End-to-end tests are written in Python and checked with `flake8` for code quality. Please run flake8 [before submitting a PR](http://flake8.pycqa.org/en/latest/user/using-hooks.html).
+
 ## Introduce additional configuration parameters
 
 In the case you want to add functionality to the operator that shall be
diff --git a/e2e/Dockerfile b/e2e/Dockerfile
new file mode 100644
index 000000000..bd646b677
--- /dev/null
+++ b/e2e/Dockerfile
@@ -0,0 +1,22 @@
+FROM ubuntu:18.04
+LABEL maintainer="Team ACID @ Zalando"
+
+WORKDIR /e2e
+
+COPY manifests ./manifests
+COPY e2e/requirements.txt e2e/tests ./
+
+RUN apt-get update \
+    && apt-get install --no-install-recommends -y \
+       python3 \
+       python3-setuptools \
+       python3-pip \
+       curl \
+    && pip3 install --no-cache-dir -r requirements.txt \
+    && curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.14.0/bin/linux/amd64/kubectl \
+    && chmod +x ./kubectl \
+    && mv ./kubectl /usr/local/bin/kubectl \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+CMD ["python3", "-m", "unittest", "discover", "--start-directory", ".", "-v"]
\ No newline at end of file
diff --git a/e2e/kind-cluster-postgres-operator-e2e-tests.yaml b/e2e/kind-cluster-postgres-operator-e2e-tests.yaml
new file mode 100644
index 000000000..a59746fd3
--- /dev/null
+++ b/e2e/kind-cluster-postgres-operator-e2e-tests.yaml
@@ -0,0 +1,6 @@
+kind: Cluster
+apiVersion: kind.sigs.k8s.io/v1alpha3
+nodes:
+- role: control-plane
+- role: worker
+- role: worker
diff --git a/e2e/requirements.txt b/e2e/requirements.txt
new file mode 100644
index 000000000..68a8775ff
--- /dev/null
+++ b/e2e/requirements.txt
@@ -0,0 +1,3 @@
+kubernetes==9.0.0
+timeout_decorator==0.4.1
+pyyaml==5.1
diff --git a/e2e/run.sh b/e2e/run.sh
new file mode 100755
index 000000000..3ee272979
--- /dev/null
+++ b/e2e/run.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+# enable unofficial bash strict mode
+set -o errexit
+set -o nounset
+set -o pipefail
+IFS=$'\n\t'
+
+readonly cluster_name="postgres-operator-e2e-tests"
+readonly operator_image=$(docker images --filter=reference="registry.opensource.zalan.do/acid/postgres-operator" --format "{{.Repository}}:{{.Tag}}" | head -1)
+readonly e2e_test_image=${cluster_name}
+readonly kubeconfig_path="/tmp/kind-config-${cluster_name}"
+
+
+function start_kind(){
+
+  # avoid interference with previous test runs
+  if [[ $(kind-linux-amd64 get clusters | grep "^${cluster_name}*") != "" ]]
+  then
+    kind-linux-amd64 delete cluster --name ${cluster_name}
+  fi
+
+  kind-linux-amd64 create cluster --name ${cluster_name} --config ./e2e/kind-cluster-postgres-operator-e2e-tests.yaml
+  kind-linux-amd64 load docker-image "${operator_image}" --name ${cluster_name}
+  KUBECONFIG="$(kind-linux-amd64 get kubeconfig-path --name=${cluster_name})"
+  export KUBECONFIG
+}
+
+function set_kind_api_server_ip(){
+  # use the actual kubeconfig to connect to the 'kind' API server
+  # but update the IP address of the API server to the one from the Docker 'bridge' network
+  cp "${KUBECONFIG}" /tmp
+  readonly local kind_api_server_port=6443 # well-known in the 'kind' codebase
+  readonly local kind_api_server=$(docker inspect --format "{{ .NetworkSettings.IPAddress }}:${kind_api_server_port}" "${cluster_name}"-control-plane)
+  sed -i "s/server.*$/server: https:\/\/$kind_api_server/g" "${kubeconfig_path}"
+}
+
+function run_tests(){
+  docker run --rm --mount type=bind,source="$(readlink -f ${kubeconfig_path})",target=/root/.kube/config -e OPERATOR_IMAGE="${operator_image}" "${e2e_test_image}"
+}
+
+function clean_up(){
+  unset KUBECONFIG
+  kind-linux-amd64 delete cluster --name ${cluster_name}
+  rm -rf ${kubeconfig_path}
+}
+
+function main(){
+
+  trap "clean_up" QUIT TERM EXIT
+
+  start_kind
+  set_kind_api_server_ip
+  run_tests
+  exit 0
+}
+
+main "$@"
diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py
new file mode 100644
index 000000000..c232ba7ac
--- /dev/null
+++ b/e2e/tests/test_e2e.py
@@ -0,0 +1,327 @@
+import unittest
+import time
+import timeout_decorator
+import subprocess
+import warnings
+import os
+import yaml
+
+from kubernetes import client, config
+
+
+class EndToEndTestCase(unittest.TestCase):
+    '''
+    Test interaction of the operator with multiple k8s components.
+    '''
+
+    # `kind` pods may get stuck in the `Terminating` phase for a few minutes; hence the high test timeout
+    TEST_TIMEOUT_SEC = 600
+
+    @classmethod
+    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
+    def setUpClass(cls):
+        '''
+        Deploy the operator to a "kind" cluster created by /e2e/run.sh using examples from /manifests.
+        This operator deployment is to be shared among all tests.
+
+        /e2e/run.sh deletes the 'kind' cluster after a successful run along with all operator-related entities.
+        If a test fails, the cluster is left running to enable manual examination;
+        the next invocation of "make e2e-run" will re-create it.
+        '''
+
+        # set a single k8s wrapper for all tests
+        k8s = cls.k8s = K8s()
+
+        # operator deploys pod service account there on start up
+        # needed for test_multi_namespace_support()
+        cls.namespace = "test"
+        v1_namespace = client.V1Namespace(metadata=client.V1ObjectMeta(name=cls.namespace))
+        k8s.api.core_v1.create_namespace(v1_namespace)
+
+        # submit the most recent operator image built on the Docker host
+        with open("manifests/postgres-operator.yaml", 'r+') as f:
+            operator_deployment = yaml.safe_load(f)
+            operator_deployment["spec"]["template"]["spec"]["containers"][0]["image"] = os.environ['OPERATOR_IMAGE']
+            yaml.dump(operator_deployment, f, Dumper=yaml.Dumper)
+
+        for filename in ["operator-service-account-rbac.yaml",
+                         "configmap.yaml",
+                         "postgres-operator.yaml"]:
+            k8s.create_with_kubectl("manifests/" + filename)
+
+        k8s.wait_for_operator_pod_start()
+
+        actual_operator_image = k8s.api.core_v1.list_namespaced_pod(
+            'default', label_selector='name=postgres-operator').items[0].spec.containers[0].image
+        print("Tested operator image: {}".format(actual_operator_image))  # shows up after tests finish
+
+        k8s.create_with_kubectl("manifests/minimal-postgres-manifest.yaml")
+        k8s.wait_for_pod_start('spilo-role=master')
+
+    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
+    def test_multi_namespace_support(self):
+        '''
+        Create a customized Postgres cluster in a non-default namespace.
+        '''
+        k8s = self.k8s
+
+        with open("manifests/complete-postgres-manifest.yaml", 'r+') as f:
+            pg_manifest = yaml.safe_load(f)
+            pg_manifest["metadata"]["namespace"] = self.namespace
+            yaml.dump(pg_manifest, f, Dumper=yaml.Dumper)
+
+        k8s.create_with_kubectl("manifests/complete-postgres-manifest.yaml")
+        k8s.wait_for_pod_start("spilo-role=master", self.namespace)
+        self.assert_master_is_unique(self.namespace, version="acid-test-cluster")
+
+    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
+    def test_scaling(self):
+        """
+        Scale up from 2 to 3 and back to 2 pods by updating the Postgres manifest at runtime.
+        """
+
+        k8s = self.k8s
+        labels = "version=acid-minimal-cluster"
+
+        k8s.wait_for_pg_to_scale(3)
+        self.assertEqual(3, k8s.count_pods_with_label(labels))
+        self.assert_master_is_unique()
+
+        k8s.wait_for_pg_to_scale(2)
+        self.assertEqual(2, k8s.count_pods_with_label(labels))
+        self.assert_master_is_unique()
+
+    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
+    def test_taint_based_eviction(self):
+        """
+        Add taint "postgres=:NoExecute" to node with master. This must cause a failover.
+ """ + k8s = self.k8s + cluster_label = 'version=acid-minimal-cluster' + + # get nodes of master and replica(s) (expected target of new master) + current_master_node, failover_targets = k8s.get_pg_nodes(cluster_label) + num_replicas = len(failover_targets) + + # if all pods live on the same node, failover will happen to other worker(s) + failover_targets = [x for x in failover_targets if x != current_master_node] + if len(failover_targets) == 0: + nodes = k8s.api.core_v1.list_node() + for n in nodes.items: + if "node-role.kubernetes.io/master" not in n.metadata.labels and n.metadata.name != current_master_node: + failover_targets.append(n.metadata.name) + + # taint node with postgres=:NoExecute to force failover + body = { + "spec": { + "taints": [ + { + "effect": "NoExecute", + "key": "postgres" + } + ] + } + } + + # patch node and test if master is failing over to one of the expected nodes + k8s.api.core_v1.patch_node(current_master_node, body) + k8s.wait_for_master_failover(failover_targets) + k8s.wait_for_pod_start('spilo-role=replica') + + new_master_node, new_replica_nodes = k8s.get_pg_nodes(cluster_label) + self.assertNotEqual(current_master_node, new_master_node, + "Master on {} did not fail over to one of {}".format(current_master_node, failover_targets)) + self.assertEqual(num_replicas, len(new_replica_nodes), + "Expected {} replicas, found {}".format(num_replicas, len(new_replica_nodes))) + self.assert_master_is_unique() + + # undo the tainting + body = { + "spec": { + "taints": [] + } + } + k8s.api.core_v1.patch_node(new_master_node, body) + + @timeout_decorator.timeout(TEST_TIMEOUT_SEC) + def test_logical_backup_cron_job(self): + """ + Ensure we can (a) create the cron job at user request for a specific PG cluster + (b) update the cluster-wide image for the logical backup pod + (c) delete the job at user request + + Limitations: + (a) Does not run the actual batch job because there is no S3 mock to upload backups to + (b) Assumes 'acid-minimal-cluster' exists as defined in setUp + """ + + k8s = self.k8s + + # create the cron job + schedule = "7 7 7 7 *" + pg_patch_enable_backup = { + "spec": { + "enableLogicalBackup": True, + "logicalBackupSchedule": schedule + } + } + k8s.api.custom_objects_api.patch_namespaced_custom_object( + "acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_patch_enable_backup) + k8s.wait_for_logical_backup_job_creation() + + jobs = k8s.get_logical_backup_job().items + self.assertEqual(1, len(jobs), "Expected 1 logical backup job, found {}".format(len(jobs))) + + job = jobs[0] + self.assertEqual(job.metadata.name, "logical-backup-acid-minimal-cluster", + "Expected job name {}, found {}" + .format("logical-backup-acid-minimal-cluster", job.metadata.name)) + self.assertEqual(job.spec.schedule, schedule, + "Expected {} schedule, found {}" + .format(schedule, job.spec.schedule)) + + # update the cluster-wide image of the logical backup pod + image = "test-image-name" + config_map_patch = { + "data": { + "logical_backup_docker_image": image, + } + } + k8s.api.core_v1.patch_namespaced_config_map("postgres-operator", "default", config_map_patch) + + operator_pod = k8s.api.core_v1.list_namespaced_pod( + 'default', label_selector="name=postgres-operator").items[0].metadata.name + k8s.api.core_v1.delete_namespaced_pod(operator_pod, "default") # restart reloads the conf + k8s.wait_for_operator_pod_start() + + jobs = k8s.get_logical_backup_job().items + actual_image = jobs[0].spec.job_template.spec.template.spec.containers[0].image + 
+        self.assertEqual(actual_image, image,
+                         "Expected job image {}, found {}".format(image, actual_image))
+
+        # delete the logical backup cron job
+        pg_patch_disable_backup = {
+            "spec": {
+                "enableLogicalBackup": False,
+            }
+        }
+        k8s.api.custom_objects_api.patch_namespaced_custom_object(
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_patch_disable_backup)
+        k8s.wait_for_logical_backup_job_deletion()
+        jobs = k8s.get_logical_backup_job().items
+        self.assertEqual(0, len(jobs),
+                         "Expected 0 logical backup jobs, found {}".format(len(jobs)))
+
+    def assert_master_is_unique(self, namespace='default', version="acid-minimal-cluster"):
+        """
+        Check that there is a single pod in the k8s cluster with the label "spilo-role=master".
+        To be called manually after operations that affect pods.
+        """
+
+        k8s = self.k8s
+        labels = 'spilo-role=master,version=' + version
+
+        num_of_master_pods = k8s.count_pods_with_label(labels, namespace)
+        self.assertEqual(num_of_master_pods, 1, "Expected 1 master pod, found {}".format(num_of_master_pods))
+
+
+class K8sApi:
+
+    def __init__(self):
+
+        # https://github.com/kubernetes-client/python/issues/309
+        warnings.simplefilter("ignore", ResourceWarning)
+
+        self.config = config.load_kube_config()
+        self.k8s_client = client.ApiClient()
+
+        self.core_v1 = client.CoreV1Api()
+        self.apps_v1 = client.AppsV1Api()
+        self.batch_v1_beta1 = client.BatchV1beta1Api()
+        self.custom_objects_api = client.CustomObjectsApi()
+
+
+class K8s:
+    '''
+    Wraps the K8s API client and provides helper methods.
+    '''
+
+    RETRY_TIMEOUT_SEC = 5
+
+    def __init__(self):
+        self.api = K8sApi()
+
+    def get_pg_nodes(self, pg_cluster_name, namespace='default'):
+        master_pod_node = ''
+        replica_pod_nodes = []
+        podsList = self.api.core_v1.list_namespaced_pod(namespace, label_selector=pg_cluster_name)
+        for pod in podsList.items:
+            if pod.metadata.labels.get('spilo-role') == 'master':
+                master_pod_node = pod.spec.node_name
+            elif pod.metadata.labels.get('spilo-role') == 'replica':
+                replica_pod_nodes.append(pod.spec.node_name)
+
+        return master_pod_node, replica_pod_nodes
+
+    def wait_for_operator_pod_start(self):
+        self.wait_for_pod_start("name=postgres-operator")
+        # HACK operator must register CRD / add existing PG clusters after pod start up
+        # for local execution ~ 10 seconds suffices
+        time.sleep(60)
+
+    def wait_for_pod_start(self, pod_labels, namespace='default'):
+        pod_phase = 'No pod running'
+        while pod_phase != 'Running':
+            pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=pod_labels).items
+            if pods:
+                pod_phase = pods[0].status.phase
+            time.sleep(self.RETRY_TIMEOUT_SEC)
+
+    def wait_for_pg_to_scale(self, number_of_instances, namespace='default'):
+
+        body = {
+            "spec": {
+                "numberOfInstances": number_of_instances
+            }
+        }
+        _ = self.api.custom_objects_api.patch_namespaced_custom_object(
+            "acid.zalan.do", "v1", namespace, "postgresqls", "acid-minimal-cluster", body)
+
+        labels = 'version=acid-minimal-cluster'
+        while self.count_pods_with_label(labels) != number_of_instances:
+            time.sleep(self.RETRY_TIMEOUT_SEC)
+
+    def count_pods_with_label(self, labels, namespace='default'):
+        return len(self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items)
+
+    def wait_for_master_failover(self, expected_master_nodes, namespace='default'):
+        pod_phase = 'Failing over'
+        new_master_node = ''
+        labels = 'spilo-role=master,version=acid-minimal-cluster'
+
+        while (pod_phase != 'Running') or (new_master_node not in expected_master_nodes):
+            pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items
+            if pods:
+                new_master_node = pods[0].spec.node_name
+                pod_phase = pods[0].status.phase
+            time.sleep(self.RETRY_TIMEOUT_SEC)
+
+    def get_logical_backup_job(self, namespace='default'):
+        return self.api.batch_v1_beta1.list_namespaced_cron_job(namespace, label_selector="application=spilo")
+
+    def wait_for_logical_backup_job(self, expected_num_of_jobs):
+        while (len(self.get_logical_backup_job().items) != expected_num_of_jobs):
+            time.sleep(self.RETRY_TIMEOUT_SEC)
+
+    def wait_for_logical_backup_job_deletion(self):
+        self.wait_for_logical_backup_job(expected_num_of_jobs=0)
+
+    def wait_for_logical_backup_job_creation(self):
+        self.wait_for_logical_backup_job(expected_num_of_jobs=1)
+
+    def create_with_kubectl(self, path):
+        subprocess.run(["kubectl", "create", "-f", path])
+
+
+if __name__ == '__main__':
+    unittest.main()
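
As an illustration of how the pieces above fit together, here is a minimal sketch of an extra check built on the `K8s` helper from `e2e/tests/test_e2e.py`. It is not part of the diff: the file name (`check_cluster.py`) and the function are hypothetical, and the sketch assumes it sits next to `test_e2e.py` and runs inside the e2e container, where `e2e/run.sh` mounts the kubeconfig at `/root/.kube/config`.

```python
# Hypothetical sketch (e.g. e2e/tests/check_cluster.py); not part of the diff above.
# Assumes it lives next to test_e2e.py and runs inside the e2e test container,
# so the import resolves and kubernetes.config.load_kube_config() finds the kubeconfig.
from test_e2e import K8s


def wait_for_minimal_cluster():
    '''Wait until the example acid-minimal-cluster has a running master pod.'''
    k8s = K8s()  # K8sApi() inside loads the kubeconfig and builds the API clients

    # block until the operator pod is running (includes the fixed sleep for CRD registration)
    k8s.wait_for_operator_pod_start()

    # block until the Spilo master pod of the minimal cluster reaches phase 'Running'
    k8s.wait_for_pod_start('spilo-role=master')

    # pods of a cluster carry the label version=<cluster name>
    print("running pods:", k8s.count_pods_with_label('version=acid-minimal-cluster'))


if __name__ == '__main__':
    wait_for_minimal_cluster()
```

Written as a `unittest.TestCase` method instead, the same calls plus `self.assert_master_is_unique()` would slot into `EndToEndTestCase` and be picked up by the `unittest discover` command used in `e2e/Dockerfile`.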