From 477fdebc953a076f044a304a03dec920ebd72bed Mon Sep 17 00:00:00 2001 From: Jirka Kremser <jkremser@redhat.com> Date: Fri, 12 Oct 2018 20:28:34 +0200 Subject: [PATCH 01/11] New stage for testing the operator restarts --- .travis.yml | 23 +++++ .travis/.travis.test-restarts.sh | 170 +++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100755 .travis/.travis.test-restarts.sh diff --git a/.travis.yml b/.travis.yml index 18d8f82e..5f87c7f9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ install: true stages: - test - test-oc-and-k8s + - test-restarts - deploy jobs: @@ -43,6 +44,28 @@ jobs: env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2 script: *kc-script-defaults + - stage: test-restarts + env: BIN=oc VERSION=v3.9.0 CRD=0 + script: &oc-restarts-script-defaults + - make build-travis + - ./.travis/.travis.prepare.openshift.sh + - ./.travis/.travis.test-restarts.sh + + - stage: + env: BIN=oc VERSION=v3.9.0 CRD=1 + script: *oc-restarts-script-defaults + + - stage: + env: BIN=kubectl VERSION=v1.9.0 CRD=0 MINIKUBE_VERSION=v0.25.2 + script: &kc-restarts-script-defaults + - make build-travis + - ./.travis/.travis.prepare.minikube.sh + - ./.travis/.travis.test-restarts.sh + + - stage: + env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2 + script: *kc-restarts-script-defaults + - stage: deploy script: # release x.y.z or x.y.z-centos if there is a release diff --git a/.travis/.travis.test-restarts.sh b/.travis/.travis.test-restarts.sh new file mode 100755 index 00000000..bf16a331 --- /dev/null +++ b/.travis/.travis.test-restarts.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +DIR="${DIR:-$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )}" +BIN=${BIN:-oc} +if [ "$CRD" = "1" ]; then + CR="cr/" + KIND="sparkcluster" +else + CR="" + KIND="cm" +fi + +cluster_up() { + echo -e "\n$(tput setaf 3)docker images:$(tput sgr0)\n" + docker images + echo + if [ "$BIN" = "oc" ]; then + set -x + oc cluster up + [ "$CRD" = "1" ] && oc login -u 
system:admin + set +x + else + echo "minikube" + start_minikube + fi +} + +start_minikube() { + export CHANGE_MINIKUBE_NONE_USER=true + sudo minikube start --vm-driver=none --kubernetes-version=${VERSION} && \ + minikube update-context + os::cmd::try_until_text "${BIN} get nodes" '\sReady' + + kubectl cluster-info + + + # kube-addon-manager is responsible for managing other k8s components, such as kube-dns, dashboard, storage-provisioner.. + os::cmd::try_until_text "${BIN} -n kube-system get pod -lcomponent=kube-addon-manager -o yaml" 'ready: true' + + # Wait for kube-dns to be ready. + os::cmd::try_until_text "${BIN} -n kube-system get pod -lk8s-app=kube-dns -o yaml" 'ready: true' +} + +tear_down() { + docker kill `docker ps -q` || true +} + +setup_testing_framework() { + source "$(dirname "${BASH_SOURCE}")/../test/lib/init.sh" + os::util::environment::setup_time_vars +} + +logs() { + echo -e "\n$(tput setaf 3)oc get all:$(tput sgr0)\n" + ${BIN} get all + echo -e "\n$(tput setaf 3)Logs:$(tput sgr0)\n" + ${BIN} logs $operator_pod + echo +} + +errorLogs() { + echo -e "\n\n$(tput setaf 1)\n š± š± š±\nBUILD FAILED\n\nš± bad things have happened š±$(tput sgr0)" + logs + exit 1 +} + +info() { + ((testIndex++)) + echo "$(tput setaf 3)[$testIndex / $total] - Running ${FUNCNAME[1]}$(tput sgr0)" +} + +testCreateOperator() { + info + [ "$CRD" = "1" ] && FOO="-crd" || FOO="" + os::cmd::expect_success_and_text "${BIN} create -f $DIR/../manifest/operator$FOO.yaml" '"?spark-operator"? 
created' && \ + os::cmd::try_until_text "${BIN} get pod -l app.kubernetes.io/name=spark-operator -o yaml" 'ready: true' + if [ "$CRD" = "1" ]; then + os::cmd::try_until_text "${BIN} get crd" 'sparkclusters.radanalytics.io' + fi + sleep 10 + export operator_pod=`${BIN} get pod -l app.kubernetes.io/name=spark-operator -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'` +} + +testCreateCluster() { + info + [ "$CRD" = "1" ] && FOO="-cr" || FOO="" + os::cmd::expect_success_and_text "${BIN} create -f $DIR/../examples/cluster$FOO.yaml" '"?my-spark-cluster"? created' && \ + os::cmd::try_until_text "${BIN} get pod -l radanalytics.io/deployment=my-spark-cluster-w -o yaml" 'ready: true' && \ + os::cmd::try_until_text "${BIN} get pod -l radanalytics.io/deployment=my-spark-cluster-m -o yaml" 'ready: true' +} + +testKillOperator() { + info + os::cmd::expect_success_and_text "${BIN} delete pod $operator_pod" 'todo: output' && \ + sleep 2 && \ + testCreateOperator +} + +testScaleCluster() { + info + if [ "$CRD" = "1" ]; then + os::cmd::expect_success_and_text '${BIN} patch sparkcluster my-spark-cluster -p "{\"spec\":{\"worker\": {\"replicas\": 1}}}" --type=merge' '"?my-spark-cluster"? patched' || errorLogs + else + os::cmd::expect_success_and_text '${BIN} patch cm my-spark-cluster -p "{\"data\":{\"config\": \"worker:\n replicas: 1\"}}"' '"?my-spark-cluster"? patched' || errorLogs + fi + os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/sparkcluster=my-spark-cluster | wc -l" '2' +} + +testDeleteCluster() { + info + os::cmd::expect_success_and_text '${BIN} delete ${KIND} my-spark-cluster' '"?my-spark-cluster"? deleted' && \ + os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/sparkcluster=my-spark-cluster 2> /dev/null | wc -l" '0' +} + +testApp() { + info + [ "$CRD" = "1" ] && FOO="test/cr/" || FOO="" + os::cmd::expect_success_and_text '${BIN} create -f examples/${FOO}app.yaml' '"?my-spark-app"? 
created' && \ + os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/app=my-spark-app 2> /dev/null | wc -l" '3' +} + +testAppResult() { + info + sleep 2 + local driver_pod=`${BIN} get pods --no-headers -l radanalytics.io/app=my-spark-app -l spark-role=driver -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'` && \ + os::cmd::try_until_text "${BIN} logs $driver_pod" 'Pi is roughly 3.1' +} + +testDeleteApp() { + info + [ "$CRD" = "1" ] && FOO="app" || FOO="cm" + os::cmd::expect_success_and_text '${BIN} delete ${FOO} my-spark-app' '"my-spark-app" deleted' && \ + os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/app=my-spark-app 2> /dev/null | wc -l" '0' +} + +run_tests() { + testKillOperator || errorLogs + testCreateCluster || errorLogs + testKillOperator || errorLogs + testScaleCluster || errorLogs + testKillOperator || errorLogs + testDeleteCluster || errorLogs + testKillOperator || errorLogs + + sleep 10 + testApp || errorLogs + testKillOperator || errorLogs + testAppResult || errorLogs + logs +} + +main() { + export total=17 + export testIndex=0 + tear_down + setup_testing_framework + os::test::junit::declare_suite_start "operator/tests-restarts" + cluster_up + testCreateOperator || { ${BIN} get events; ${BIN} get pods; exit 1; } + if [ "$#" -gt 0 ]; then + # run single test that is passed as arg + $1 + else + run_tests + fi + os::test::junit::declare_suite_end + tear_down +} + +main $@ From 83572c78b8ee678d73b1583cb71c5c6f5c270e14 Mon Sep 17 00:00:00 2001 From: Jirka Kremser <jkremser@redhat.com> Date: Mon, 15 Oct 2018 15:32:08 +0200 Subject: [PATCH 02/11] Names for travis jobs and skipping the default script for deploy stage --- .travis.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5f87c7f9..162107e4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -44,7 +44,8 @@ jobs: env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2 script: 
*kc-script-defaults - - stage: test-restarts + - stage: + name: "Restart pod [oc cm]" env: BIN=oc VERSION=v3.9.0 CRD=0 script: &oc-restarts-script-defaults - make build-travis @@ -52,10 +53,12 @@ jobs: - ./.travis/.travis.test-restarts.sh - stage: + name: "Restart pod [oc crd]" env: BIN=oc VERSION=v3.9.0 CRD=1 script: *oc-restarts-script-defaults - stage: + name: "Restart pod [kc cm]" env: BIN=kubectl VERSION=v1.9.0 CRD=0 MINIKUBE_VERSION=v0.25.2 script: &kc-restarts-script-defaults - make build-travis @@ -63,11 +66,13 @@ jobs: - ./.travis/.travis.test-restarts.sh - stage: + name: "Restart pod [kc crd]" env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2 script: *kc-restarts-script-defaults - stage: deploy - script: + script: skip + deploy: # release x.y.z or x.y.z-centos if there is a release # or release the latest image if building the master branch - ./.travis/.travis.release.images.sh From a32f2dc8c2ada9b06fdf50f3a182092e2f3f686f Mon Sep 17 00:00:00 2001 From: Jirka Kremser <jkremser@redhat.com> Date: Tue, 20 Nov 2018 12:06:09 +0100 Subject: [PATCH 03/11] name for all stages; fix test --- .travis.yml | 24 +++++++++--------------- .travis/.travis.test-restarts.sh | 5 ++--- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/.travis.yml b/.travis.yml index 162107e4..91cdf513 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,7 @@ jobs: script: make build-travis test - stage: test-oc-and-k8s + name: "Specs [oc • CMs]" env: BIN=oc VERSION=v3.9.0 CRD=0 script: &oc-script-defaults - make build-travis @@ -26,6 +27,7 @@ jobs: - ./.travis/.travis.test-oc-and-k8s.sh - stage: + name: "Specs [oc • CRs]" env: BIN=oc VERSION=v3.9.0 CRD=1 script: *oc-script-defaults @@ -34,6 +36,7 @@ jobs: # script: *oc-script-defaults - stage: + name: "Specs [K8s • CMs]" env: BIN=kubectl VERSION=v1.9.0 CRD=0 MINIKUBE_VERSION=v0.25.2 script: &kc-script-defaults - make build-travis @@ -41,35 +44,26 @@ jobs: - ./.travis/.travis.test-oc-and-k8s.sh - stage: + name: 
"Specs [K8s • CRs]" env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2 script: *kc-script-defaults - stage: - name: "Restart pod [oc cm]" + name: "Restarts [oc • CMs]" env: BIN=oc VERSION=v3.9.0 CRD=0 - script: &oc-restarts-script-defaults + script: - make build-travis - ./.travis/.travis.prepare.openshift.sh - ./.travis/.travis.test-restarts.sh - stage: - name: "Restart pod [oc crd]" - env: BIN=oc VERSION=v3.9.0 CRD=1 - script: *oc-restarts-script-defaults - - - stage: - name: "Restart pod [kc cm]" - env: BIN=kubectl VERSION=v1.9.0 CRD=0 MINIKUBE_VERSION=v0.25.2 - script: &kc-restarts-script-defaults + name: "Restarts [K8s • CRs]" + env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2 + script: - make build-travis - ./.travis/.travis.prepare.minikube.sh - ./.travis/.travis.test-restarts.sh - - stage: - name: "Restart pod [kc crd]" - env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2 - script: *kc-restarts-script-defaults - - stage: deploy script: skip deploy: diff --git a/.travis/.travis.test-restarts.sh b/.travis/.travis.test-restarts.sh index bf16a331..fd9b2165 100755 --- a/.travis/.travis.test-restarts.sh +++ b/.travis/.travis.test-restarts.sh @@ -91,9 +91,8 @@ testCreateCluster() { testKillOperator() { info - os::cmd::expect_success_and_text "${BIN} delete pod $operator_pod" 'todo: output' && \ - sleep 2 && \ - testCreateOperator + os::cmd::expect_success_and_text "${BIN} delete pod $operator_pod" 'pod "?'$operator_pod'"? 
deleted' && \ + sleep 7 } testScaleCluster() { From 9798644ec380026f4c7b47e0f8c5b003cabdcb43 Mon Sep 17 00:00:00 2001 From: Jirka Kremser <jkremser@redhat.com> Date: Tue, 20 Nov 2018 12:32:02 +0100 Subject: [PATCH 04/11] Operator pod needs to be retrieved again after the restart --- .travis/.travis.test-restarts.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis/.travis.test-restarts.sh b/.travis/.travis.test-restarts.sh index fd9b2165..a6254ff0 100755 --- a/.travis/.travis.test-restarts.sh +++ b/.travis/.travis.test-restarts.sh @@ -78,7 +78,6 @@ testCreateOperator() { os::cmd::try_until_text "${BIN} get crd" 'sparkclusters.radanalytics.io' fi sleep 10 - export operator_pod=`${BIN} get pod -l app.kubernetes.io/name=spark-operator -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'` } testCreateCluster() { @@ -91,7 +90,8 @@ testCreateCluster() { testKillOperator() { info - os::cmd::expect_success_and_text "${BIN} delete pod $operator_pod" 'pod "?'$operator_pod'"? deleted' && \ + local pod=`${BIN} get pod -l app.kubernetes.io/name=spark-operator -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'` + os::cmd::expect_success_and_text "${BIN} delete pod $pod" 'pod "?'$pod'"? deleted' && \ sleep 7 } From aa66a9c8ece6b5870f5fbe4da820eeb40fdf905f Mon Sep 17 00:00:00 2001 From: Jirka Kremser <jkremser@redhat.com> Date: Tue, 20 Nov 2018 13:00:50 +0100 Subject: [PATCH 05/11] app -> sparkapplication --- .travis.yml | 1 + .travis/.travis.test-restarts.sh | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 91cdf513..b965e685 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,7 @@ stages: jobs: include: - stage: test + name: "Maven & cont. 
image build" language: java script: make build-travis test diff --git a/.travis/.travis.test-restarts.sh b/.travis/.travis.test-restarts.sh index a6254ff0..f09b9162 100755 --- a/.travis/.travis.test-restarts.sh +++ b/.travis/.travis.test-restarts.sh @@ -98,9 +98,9 @@ testKillOperator() { testScaleCluster() { info if [ "$CRD" = "1" ]; then - os::cmd::expect_success_and_text '${BIN} patch sparkcluster my-spark-cluster -p "{\"spec\":{\"worker\": {\"replicas\": 1}}}" --type=merge' '"?my-spark-cluster"? patched' || errorLogs + os::cmd::expect_success_and_text '${BIN} patch sparkcluster my-spark-cluster -p "{\"spec\":{\"worker\": {\"instances\": 1}}}" --type=merge' '"?my-spark-cluster"? patched' || errorLogs else - os::cmd::expect_success_and_text '${BIN} patch cm my-spark-cluster -p "{\"data\":{\"config\": \"worker:\n replicas: 1\"}}"' '"?my-spark-cluster"? patched' || errorLogs + os::cmd::expect_success_and_text '${BIN} patch cm my-spark-cluster -p "{\"data\":{\"config\": \"worker:\n instances: 1\"}}"' '"?my-spark-cluster"? patched' || errorLogs fi os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/sparkcluster=my-spark-cluster | wc -l" '2' } @@ -115,21 +115,21 @@ testApp() { info [ "$CRD" = "1" ] && FOO="test/cr/" || FOO="" os::cmd::expect_success_and_text '${BIN} create -f examples/${FOO}app.yaml' '"?my-spark-app"? 
created' && \ - os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/app=my-spark-app 2> /dev/null | wc -l" '3' + os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/sparkapplication=my-spark-app 2> /dev/null | wc -l" '3' } testAppResult() { info sleep 2 - local driver_pod=`${BIN} get pods --no-headers -l radanalytics.io/app=my-spark-app -l spark-role=driver -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'` && \ + local driver_pod=`${BIN} get pods --no-headers -l radanalytics.io/sparkapplication=my-spark-app -l spark-role=driver -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'` && \ os::cmd::try_until_text "${BIN} logs $driver_pod" 'Pi is roughly 3.1' } testDeleteApp() { info - [ "$CRD" = "1" ] && FOO="app" || FOO="cm" + [ "$CRD" = "1" ] && FOO="sparkapplication" || FOO="cm" os::cmd::expect_success_and_text '${BIN} delete ${FOO} my-spark-app' '"my-spark-app" deleted' && \ - os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/app=my-spark-app 2> /dev/null | wc -l" '0' + os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/sparkapplication=my-spark-app 2> /dev/null | wc -l" '0' } run_tests() { From 4b4106d1b6147b900c3545de2c690e5fe669c994 Mon Sep 17 00:00:00 2001 From: Jirka Kremser <jkremser@redhat.com> Date: Tue, 20 Nov 2018 15:57:35 +0100 Subject: [PATCH 06/11] tests: redefine operator_pod for logs --- .travis/.travis.test-restarts.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.travis/.travis.test-restarts.sh b/.travis/.travis.test-restarts.sh index f09b9162..aa141b73 100755 --- a/.travis/.travis.test-restarts.sh +++ b/.travis/.travis.test-restarts.sh @@ -78,6 +78,7 @@ testCreateOperator() { os::cmd::try_until_text "${BIN} get crd" 'sparkclusters.radanalytics.io' fi sleep 10 + export operator_pod=`${BIN} get pod -l app.kubernetes.io/name=spark-operator -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'` } testCreateCluster() { @@ 
-90,9 +91,9 @@ testCreateCluster() { testKillOperator() { info - local pod=`${BIN} get pod -l app.kubernetes.io/name=spark-operator -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'` - os::cmd::expect_success_and_text "${BIN} delete pod $pod" 'pod "?'$pod'"? deleted' && \ - sleep 7 + os::cmd::expect_success_and_text "${BIN} delete pod $operator_pod" 'pod "?'$operator_pod'"? deleted' && \ + sleep 10 + export operator_pod=`${BIN} get pod -l app.kubernetes.io/name=spark-operator -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'` } testScaleCluster() { From 9ab7e26b47ba4693534fe858e715a4ab62e47090 Mon Sep 17 00:00:00 2001 From: Jirka Kremser <jkremser@redhat.com> Date: Tue, 20 Nov 2018 16:44:25 +0100 Subject: [PATCH 07/11] Fix: NPE that was found thanks to travis CI --- .../io/radanalytics/operator/cluster/SparkClusterOperator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java b/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java index f735eef1..781916da 100644 --- a/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java +++ b/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java @@ -119,7 +119,7 @@ public void fullReconciliation() { Integer actualWorkers = actual.get(dCluster.getName()); if (actualWorkers != null && desiredWorkers != actualWorkers) { // update the internal representation with the actual # of workers - Optional.ofNullable(clusters.getCluster(dCluster.getName()).getWorker()) + Optional.ofNullable(clusters.getCluster(dCluster.getName())).map(SparkCluster::getWorker) .ifPresent(worker -> worker.setInstances(actualWorkers)); onModify(dCluster); } From 43f8a3bcce04aaa61924b87e8dc6eb2e9087e19a Mon Sep 17 00:00:00 2001 From: Jirka Kremser <jkremser@redhat.com> Date: Wed, 21 Nov 2018 13:10:30 +0100 Subject: [PATCH 08/11] Running clusters # can't be negative (in case of full reconciliation and edge 
case) --- .../io/radanalytics/operator/cluster/RunningClusters.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java b/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java index 5b2e777d..a17ad656 100644 --- a/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java +++ b/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java @@ -43,9 +43,11 @@ public void put(SparkCluster ci) { } public void delete(String name) { - runningClusters.dec(); - workers.labels(name).set(0); - clusters.remove(name); + if (clusters.containsKey(name)) { + runningClusters.dec(); + workers.labels(name).set(0); + clusters.remove(name); + } } public SparkCluster getCluster(String name) { From 2574e839a92ab81204a1bee59943bb6a4931d836 Mon Sep 17 00:00:00 2001 From: Jirka Kremser <jkremser@redhat.com> Date: Wed, 21 Nov 2018 15:16:45 +0100 Subject: [PATCH 09/11] Full reconciliation for scaling after operator restart and the internal representation of the state was lost. 
--- .../operator/cluster/RunningClusters.java | 8 +++- .../cluster/SparkClusterOperator.java | 38 +++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java b/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java index a17ad656..fa63bd8a 100644 --- a/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java +++ b/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java @@ -23,7 +23,7 @@ public class RunningClusters { .labelNames("cluster") .register(); - public static final Counter startedTotal = Counter.build() + public static final Gauge startedTotal = Gauge.build() .name("operator_started_total") .help("Spark clusters has been started by operator.") .register(); @@ -54,4 +54,10 @@ public SparkCluster getCluster(String name) { return this.clusters.get(name); } + public void resetMetrics() { + startedTotal.set(0); + workers.clear(); + startedTotal.set(0); + } + } diff --git a/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java b/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java index 781916da..5f544081 100644 --- a/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java +++ b/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java @@ -1,5 +1,6 @@ package io.radanalytics.operator.cluster; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Functions; import com.google.common.collect.Sets; import io.fabric8.kubernetes.api.model.DoneableReplicationController; @@ -18,7 +19,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.util.*; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; import static io.radanalytics.operator.common.AnsiColors.*; @@ -82,6 +85,7 @@ public void fullReconciliation() { // 5. 
modify / scale log.info("Running full reconciliation for namespace {} and kind {}..", namespace, entityName); + final AtomicBoolean change = new AtomicBoolean(false); Set<SparkCluster> desiredSet = super.getDesiredSet(); Map<String, SparkCluster> desiredMap = desiredSet.stream().collect(Collectors.toMap(SparkCluster::getName, Functions.identity())); Map<String, Integer> actual = getActual(); @@ -94,9 +98,11 @@ public void fullReconciliation() { if (!toBeCreated.isEmpty()) { log.info("toBeCreated: {}", toBeCreated); + change.set(true); } if (!toBeDeleted.isEmpty()) { log.info("toBeDeleted: {}", toBeDeleted); + change.set(true); } // add new @@ -118,12 +124,38 @@ public void fullReconciliation() { int desiredWorkers = Optional.ofNullable(dCluster.getWorker()).orElse(new RCSpec()).getInstances(); Integer actualWorkers = actual.get(dCluster.getName()); if (actualWorkers != null && desiredWorkers != actualWorkers) { - // update the internal representation with the actual # of workers - Optional.ofNullable(clusters.getCluster(dCluster.getName())).map(SparkCluster::getWorker) - .ifPresent(worker -> worker.setInstances(actualWorkers)); + change.set(true); + // update the internal representation with the actual # of workers and call onModify + if (clusters.getCluster(dCluster.getName()) == null) { + // deep copy via json -> room for optimization + ObjectMapper om = new ObjectMapper(); + try { + SparkCluster actualCluster = om.readValue(om.writeValueAsString(dCluster), SparkCluster.class); + Optional.ofNullable(actualCluster.getWorker()).ifPresent(w -> w.setInstances(actualWorkers)); + clusters.put(actualCluster); + } catch (IOException e) { + log.warn(e.getMessage()); + e.printStackTrace(); + return; + } + } else { + Optional.ofNullable(clusters.getCluster(dCluster.getName())).map(SparkCluster::getWorker) + .ifPresent(worker -> worker.setInstances(actualWorkers)); + } + log.info("scaling cluster {}", dCluster.getName()); onModify(dCluster); } }); + + // first reconciliation 
after (re)start -> update the clusters instance + if (!fullReconciliationRun) { + clusters.resetMetrics(); + desiredMap.entrySet().forEach(e -> clusters.put(e.getValue())); + } + + if (!change.get()) { + log.info("no change was detected during the reconciliation"); + } } private Map<String, Integer> getActual() { From 9a687b7cb3f42aa8edaaa20df7678c0951314726 Mon Sep 17 00:00:00 2001 From: Jirka Kremser <jkremser@redhat.com> Date: Wed, 21 Nov 2018 16:21:32 +0100 Subject: [PATCH 10/11] travis.yml: lint --- .travis.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index b965e685..a61e3e59 100644 --- a/.travis.yml +++ b/.travis.yml @@ -53,17 +53,17 @@ jobs: name: "Restarts [oc • CMs]" env: BIN=oc VERSION=v3.9.0 CRD=0 script: - - make build-travis - - ./.travis/.travis.prepare.openshift.sh - - ./.travis/.travis.test-restarts.sh + - make build-travis + - ./.travis/.travis.prepare.openshift.sh + - ./.travis/.travis.test-restarts.sh - stage: name: "Restarts [K8s • CRs]" env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2 script: - - make build-travis - - ./.travis/.travis.prepare.minikube.sh - - ./.travis/.travis.test-restarts.sh + - make build-travis + - ./.travis/.travis.prepare.minikube.sh + - ./.travis/.travis.test-restarts.sh - stage: deploy script: skip From 73a98e5562002f466adb78782d6243ff4253886b Mon Sep 17 00:00:00 2001 From: Jirka Kremser <jkremser@redhat.com> Date: Wed, 21 Nov 2018 17:14:23 +0100 Subject: [PATCH 11/11] travis.yml: fix the syntax --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index a61e3e59..674ad637 100644 --- a/.travis.yml +++ b/.travis.yml @@ -66,8 +66,8 @@ jobs: - ./.travis/.travis.test-restarts.sh - stage: deploy - script: skip - deploy: + name: "Push container images" + script: # release x.y.z or x.y.z-centos if there is a release # or release the latest image if building the master branch - 
./.travis/.travis.release.images.sh