From 477fdebc953a076f044a304a03dec920ebd72bed Mon Sep 17 00:00:00 2001
From: Jirka Kremser <jkremser@redhat.com>
Date: Fri, 12 Oct 2018 20:28:34 +0200
Subject: [PATCH 01/11] New stage for testing the operator restarts

---
 .travis.yml                      |  23 +++++
 .travis/.travis.test-restarts.sh | 170 +++++++++++++++++++++++++++++++
 2 files changed, 193 insertions(+)
 create mode 100755 .travis/.travis.test-restarts.sh

diff --git a/.travis.yml b/.travis.yml
index 18d8f82e..5f87c7f9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,6 +9,7 @@ install: true
 stages:
   - test
   - test-oc-and-k8s
+  - test-restarts
   - deploy
 
 jobs:
@@ -43,6 +44,28 @@ jobs:
       env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2
       script: *kc-script-defaults
 
+    - stage: test-restarts
+      env: BIN=oc VERSION=v3.9.0 CRD=0
+      script: &oc-restarts-script-defaults
+      - make build-travis
+      - ./.travis/.travis.prepare.openshift.sh
+      - ./.travis/.travis.test-restarts.sh
+
+    - stage:
+      env: BIN=oc VERSION=v3.9.0 CRD=1
+      script: *oc-restarts-script-defaults
+
+    - stage:
+      env: BIN=kubectl VERSION=v1.9.0 CRD=0 MINIKUBE_VERSION=v0.25.2
+      script: &kc-restarts-script-defaults
+      - make build-travis
+      - ./.travis/.travis.prepare.minikube.sh
+      - ./.travis/.travis.test-restarts.sh
+
+    - stage:
+      env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2
+      script: *kc-restarts-script-defaults
+
     - stage: deploy
       script:
         # release x.y.z or x.y.z-centos if there is a release
diff --git a/.travis/.travis.test-restarts.sh b/.travis/.travis.test-restarts.sh
new file mode 100755
index 00000000..bf16a331
--- /dev/null
+++ b/.travis/.travis.test-restarts.sh
@@ -0,0 +1,170 @@
+#!/bin/bash
+
+DIR="${DIR:-$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )}"
+BIN=${BIN:-oc}
+if [ "$CRD" = "1" ]; then
+  CR="cr/"
+  KIND="sparkcluster"
+else
+  CR=""
+  KIND="cm"
+fi
+
+cluster_up() {
+  echo -e "\n$(tput setaf 3)docker images:$(tput sgr0)\n"
+  docker images
+  echo
+  if [ "$BIN" = "oc" ]; then
+    set -x
+    oc cluster up
+    [ "$CRD" = "1" ] && oc login -u system:admin
+    set +x
+  else
+    echo "minikube"
+    start_minikube
+  fi
+}
+
+start_minikube() {
+  export CHANGE_MINIKUBE_NONE_USER=true
+  sudo minikube start --vm-driver=none --kubernetes-version=${VERSION} && \
+  minikube update-context
+  os::cmd::try_until_text "${BIN} get nodes" '\sReady'
+
+  kubectl cluster-info
+
+
+  # kube-addon-manager is responsible for managing other k8s components, such as kube-dns, dashboard, storage-provisioner..
+  os::cmd::try_until_text "${BIN} -n kube-system get pod -lcomponent=kube-addon-manager -o yaml" 'ready: true'
+
+  # Wait for kube-dns to be ready.
+  os::cmd::try_until_text "${BIN} -n kube-system get pod -lk8s-app=kube-dns -o yaml" 'ready: true'
+}
+
+tear_down() {
+  docker kill `docker ps -q` || true
+}
+
+setup_testing_framework() {
+  source "$(dirname "${BASH_SOURCE}")/../test/lib/init.sh"
+  os::util::environment::setup_time_vars
+}
+
+logs() {
+  echo -e "\n$(tput setaf 3)oc get all:$(tput sgr0)\n"
+  ${BIN} get all
+  echo -e "\n$(tput setaf 3)Logs:$(tput sgr0)\n"
+  ${BIN} logs $operator_pod
+  echo
+}
+
+errorLogs() {
+  echo -e "\n\n$(tput setaf 1)\n  😱 😱 😱\nBUILD FAILED\n\n😱 bad things have happened 😱$(tput sgr0)"
+  logs
+  exit 1
+}
+
+info() {
+  ((testIndex++))
+  echo "$(tput setaf 3)[$testIndex / $total] - Running ${FUNCNAME[1]}$(tput sgr0)"
+}
+
+testCreateOperator() {
+  info
+  [ "$CRD" = "1" ] && FOO="-crd" || FOO=""
+  os::cmd::expect_success_and_text "${BIN} create -f $DIR/../manifest/operator$FOO.yaml" '"?spark-operator"? created' && \
+  os::cmd::try_until_text "${BIN} get pod -l app.kubernetes.io/name=spark-operator -o yaml" 'ready: true'
+  if [ "$CRD" = "1" ]; then
+    os::cmd::try_until_text "${BIN} get crd" 'sparkclusters.radanalytics.io'
+  fi
+  sleep 10
+  export operator_pod=`${BIN} get pod -l app.kubernetes.io/name=spark-operator -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'`
+}
+
+testCreateCluster() {
+  info
+  [ "$CRD" = "1" ] && FOO="-cr" || FOO=""
+  os::cmd::expect_success_and_text "${BIN} create -f $DIR/../examples/cluster$FOO.yaml" '"?my-spark-cluster"? created' && \
+  os::cmd::try_until_text "${BIN} get pod -l radanalytics.io/deployment=my-spark-cluster-w -o yaml" 'ready: true' && \
+  os::cmd::try_until_text "${BIN} get pod -l radanalytics.io/deployment=my-spark-cluster-m -o yaml" 'ready: true'
+}
+
+testKillOperator() {
+  info
+  os::cmd::expect_success_and_text "${BIN} delete pod $operator_pod" 'todo: output' && \
+  sleep 2 && \
+  testCreateOperator
+}
+
+testScaleCluster() {
+  info
+  if [ "$CRD" = "1" ]; then
+    os::cmd::expect_success_and_text '${BIN} patch sparkcluster my-spark-cluster -p "{\"spec\":{\"worker\": {\"replicas\": 1}}}" --type=merge' '"?my-spark-cluster"? patched' || errorLogs
+  else
+    os::cmd::expect_success_and_text '${BIN} patch cm my-spark-cluster -p "{\"data\":{\"config\": \"worker:\n  replicas: 1\"}}"' '"?my-spark-cluster"? patched' || errorLogs
+  fi
+  os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/sparkcluster=my-spark-cluster | wc -l" '2'
+}
+
+testDeleteCluster() {
+  info
+  os::cmd::expect_success_and_text '${BIN} delete ${KIND} my-spark-cluster' '"?my-spark-cluster"? deleted' && \
+  os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/sparkcluster=my-spark-cluster 2> /dev/null | wc -l" '0'
+}
+
+testApp() {
+  info
+  [ "$CRD" = "1" ] && FOO="test/cr/" || FOO=""
+  os::cmd::expect_success_and_text '${BIN} create -f examples/${FOO}app.yaml' '"?my-spark-app"? created' && \
+  os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/app=my-spark-app 2> /dev/null | wc -l" '3'
+}
+
+testAppResult() {
+  info
+  sleep 2
+  local driver_pod=`${BIN} get pods --no-headers -l radanalytics.io/app=my-spark-app -l spark-role=driver -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'` && \
+  os::cmd::try_until_text "${BIN} logs $driver_pod" 'Pi is roughly 3.1'
+}
+
+testDeleteApp() {
+  info
+  [ "$CRD" = "1" ] && FOO="app" || FOO="cm"
+  os::cmd::expect_success_and_text '${BIN} delete ${FOO} my-spark-app' '"my-spark-app" deleted' && \
+  os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/app=my-spark-app 2> /dev/null | wc -l" '0'
+}
+
+run_tests() {
+  testKillOperator || errorLogs
+  testCreateCluster || errorLogs
+  testKillOperator || errorLogs
+  testScaleCluster || errorLogs
+  testKillOperator || errorLogs
+  testDeleteCluster || errorLogs
+  testKillOperator || errorLogs
+
+  sleep 10
+  testApp || errorLogs
+  testKillOperator || errorLogs
+  testAppResult || errorLogs
+  logs
+}
+
+main() {
+  export total=17
+  export testIndex=0
+  tear_down
+  setup_testing_framework
+  os::test::junit::declare_suite_start "operator/tests-restarts"
+  cluster_up
+  testCreateOperator || { ${BIN} get events; ${BIN} get pods; exit 1; }
+  if [ "$#" -gt 0 ]; then
+    # run single test that is passed as arg
+    $1
+  else
+    run_tests
+  fi
+  os::test::junit::declare_suite_end
+  tear_down
+}
+
+main $@

From 83572c78b8ee678d73b1583cb71c5c6f5c270e14 Mon Sep 17 00:00:00 2001
From: Jirka Kremser <jkremser@redhat.com>
Date: Mon, 15 Oct 2018 15:32:08 +0200
Subject: [PATCH 02/11] Add names for the Travis jobs and skip the default
 script for the deploy stage

---
 .travis.yml | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 5f87c7f9..162107e4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -44,7 +44,8 @@ jobs:
       env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2
       script: *kc-script-defaults
 
-    - stage: test-restarts
+    - stage:
+      name: "Restart pod [oc cm]"
       env: BIN=oc VERSION=v3.9.0 CRD=0
       script: &oc-restarts-script-defaults
       - make build-travis
@@ -52,10 +53,12 @@ jobs:
       - ./.travis/.travis.test-restarts.sh
 
     - stage:
+      name: "Restart pod [oc crd]"
       env: BIN=oc VERSION=v3.9.0 CRD=1
       script: *oc-restarts-script-defaults
 
     - stage:
+      name: "Restart pod [kc cm]"
       env: BIN=kubectl VERSION=v1.9.0 CRD=0 MINIKUBE_VERSION=v0.25.2
       script: &kc-restarts-script-defaults
       - make build-travis
@@ -63,11 +66,13 @@ jobs:
       - ./.travis/.travis.test-restarts.sh
 
     - stage:
+      name: "Restart pod [kc crd]"
       env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2
       script: *kc-restarts-script-defaults
 
     - stage: deploy
-      script:
+      script: skip
+      deploy:
         # release x.y.z or x.y.z-centos if there is a release
         # or release the latest image if building the master branch
         - ./.travis/.travis.release.images.sh

From a32f2dc8c2ada9b06fdf50f3a182092e2f3f686f Mon Sep 17 00:00:00 2001
From: Jirka Kremser <jkremser@redhat.com>
Date: Tue, 20 Nov 2018 12:06:09 +0100
Subject: [PATCH 03/11] Add names for all stages; fix the restart test

---
 .travis.yml                      | 24 +++++++++---------------
 .travis/.travis.test-restarts.sh |  5 ++---
 2 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 162107e4..91cdf513 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -19,6 +19,7 @@ jobs:
       script: make build-travis test
 
     - stage: test-oc-and-k8s
+      name: "Specs [oc • CMs]"
       env: BIN=oc VERSION=v3.9.0 CRD=0
       script: &oc-script-defaults
         - make build-travis
@@ -26,6 +27,7 @@ jobs:
         - ./.travis/.travis.test-oc-and-k8s.sh
 
     - stage:
+      name: "Specs [oc • CRs]"
       env: BIN=oc VERSION=v3.9.0 CRD=1
       script: *oc-script-defaults
 
@@ -34,6 +36,7 @@ jobs:
     #  script: *oc-script-defaults
 
     - stage:
+      name: "Specs [K8s • CMs]"
       env: BIN=kubectl VERSION=v1.9.0 CRD=0 MINIKUBE_VERSION=v0.25.2
       script: &kc-script-defaults
         - make build-travis
@@ -41,35 +44,26 @@ jobs:
         - ./.travis/.travis.test-oc-and-k8s.sh
 
     - stage:
+      name: "Specs [K8s • CRs]"
       env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2
       script: *kc-script-defaults
 
     - stage:
-      name: "Restart pod [oc cm]"
+      name: "Restarts [oc • CMs]"
       env: BIN=oc VERSION=v3.9.0 CRD=0
-      script: &oc-restarts-script-defaults
+      script:
       - make build-travis
       - ./.travis/.travis.prepare.openshift.sh
       - ./.travis/.travis.test-restarts.sh
 
     - stage:
-      name: "Restart pod [oc crd]"
-      env: BIN=oc VERSION=v3.9.0 CRD=1
-      script: *oc-restarts-script-defaults
-
-    - stage:
-      name: "Restart pod [kc cm]"
-      env: BIN=kubectl VERSION=v1.9.0 CRD=0 MINIKUBE_VERSION=v0.25.2
-      script: &kc-restarts-script-defaults
+      name: "Restarts [K8s • CRs]"
+      env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2
+      script:
       - make build-travis
       - ./.travis/.travis.prepare.minikube.sh
       - ./.travis/.travis.test-restarts.sh
 
-    - stage:
-      name: "Restart pod [kc crd]"
-      env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2
-      script: *kc-restarts-script-defaults
-
     - stage: deploy
       script: skip
       deploy:
diff --git a/.travis/.travis.test-restarts.sh b/.travis/.travis.test-restarts.sh
index bf16a331..fd9b2165 100755
--- a/.travis/.travis.test-restarts.sh
+++ b/.travis/.travis.test-restarts.sh
@@ -91,9 +91,8 @@ testCreateCluster() {
 
 testKillOperator() {
   info
-  os::cmd::expect_success_and_text "${BIN} delete pod $operator_pod" 'todo: output' && \
-  sleep 2 && \
-  testCreateOperator
+  os::cmd::expect_success_and_text "${BIN} delete pod $operator_pod" 'pod "?'$operator_pod'"? deleted' && \
+  sleep 7
 }
 
 testScaleCluster() {

From 9798644ec380026f4c7b47e0f8c5b003cabdcb43 Mon Sep 17 00:00:00 2001
From: Jirka Kremser <jkremser@redhat.com>
Date: Tue, 20 Nov 2018 12:32:02 +0100
Subject: [PATCH 04/11] Operator pod needs to be retrieved again after the
 restart

---
 .travis/.travis.test-restarts.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis/.travis.test-restarts.sh b/.travis/.travis.test-restarts.sh
index fd9b2165..a6254ff0 100755
--- a/.travis/.travis.test-restarts.sh
+++ b/.travis/.travis.test-restarts.sh
@@ -78,7 +78,6 @@ testCreateOperator() {
     os::cmd::try_until_text "${BIN} get crd" 'sparkclusters.radanalytics.io'
   fi
   sleep 10
-  export operator_pod=`${BIN} get pod -l app.kubernetes.io/name=spark-operator -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'`
 }
 
 testCreateCluster() {
@@ -91,7 +90,8 @@ testCreateCluster() {
 
 testKillOperator() {
   info
-  os::cmd::expect_success_and_text "${BIN} delete pod $operator_pod" 'pod "?'$operator_pod'"? deleted' && \
+  local pod=`${BIN} get pod -l app.kubernetes.io/name=spark-operator -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'`
+  os::cmd::expect_success_and_text "${BIN} delete pod $pod" 'pod "?'$pod'"? deleted' && \
   sleep 7
 }
 

From aa66a9c8ece6b5870f5fbe4da820eeb40fdf905f Mon Sep 17 00:00:00 2001
From: Jirka Kremser <jkremser@redhat.com>
Date: Tue, 20 Nov 2018 13:00:50 +0100
Subject: [PATCH 05/11] app -> sparkapplication

---
 .travis.yml                      |  1 +
 .travis/.travis.test-restarts.sh | 12 ++++++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 91cdf513..b965e685 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,6 +15,7 @@ stages:
 jobs:
   include:
     - stage: test
+      name: "Maven & cont. image build"
       language: java
       script: make build-travis test
 
diff --git a/.travis/.travis.test-restarts.sh b/.travis/.travis.test-restarts.sh
index a6254ff0..f09b9162 100755
--- a/.travis/.travis.test-restarts.sh
+++ b/.travis/.travis.test-restarts.sh
@@ -98,9 +98,9 @@ testKillOperator() {
 testScaleCluster() {
   info
   if [ "$CRD" = "1" ]; then
-    os::cmd::expect_success_and_text '${BIN} patch sparkcluster my-spark-cluster -p "{\"spec\":{\"worker\": {\"replicas\": 1}}}" --type=merge' '"?my-spark-cluster"? patched' || errorLogs
+    os::cmd::expect_success_and_text '${BIN} patch sparkcluster my-spark-cluster -p "{\"spec\":{\"worker\": {\"instances\": 1}}}" --type=merge' '"?my-spark-cluster"? patched' || errorLogs
   else
-    os::cmd::expect_success_and_text '${BIN} patch cm my-spark-cluster -p "{\"data\":{\"config\": \"worker:\n  replicas: 1\"}}"' '"?my-spark-cluster"? patched' || errorLogs
+    os::cmd::expect_success_and_text '${BIN} patch cm my-spark-cluster -p "{\"data\":{\"config\": \"worker:\n  instances: 1\"}}"' '"?my-spark-cluster"? patched' || errorLogs
   fi
   os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/sparkcluster=my-spark-cluster | wc -l" '2'
 }
@@ -115,21 +115,21 @@ testApp() {
   info
   [ "$CRD" = "1" ] && FOO="test/cr/" || FOO=""
   os::cmd::expect_success_and_text '${BIN} create -f examples/${FOO}app.yaml' '"?my-spark-app"? created' && \
-  os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/app=my-spark-app 2> /dev/null | wc -l" '3'
+  os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/sparkapplication=my-spark-app 2> /dev/null | wc -l" '3'
 }
 
 testAppResult() {
   info
   sleep 2
-  local driver_pod=`${BIN} get pods --no-headers -l radanalytics.io/app=my-spark-app -l spark-role=driver -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'` && \
+  local driver_pod=`${BIN} get pods --no-headers -l radanalytics.io/sparkapplication=my-spark-app -l spark-role=driver -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'` && \
   os::cmd::try_until_text "${BIN} logs $driver_pod" 'Pi is roughly 3.1'
 }
 
 testDeleteApp() {
   info
-  [ "$CRD" = "1" ] && FOO="app" || FOO="cm"
+  [ "$CRD" = "1" ] && FOO="sparkapplication" || FOO="cm"
   os::cmd::expect_success_and_text '${BIN} delete ${FOO} my-spark-app' '"my-spark-app" deleted' && \
-  os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/app=my-spark-app 2> /dev/null | wc -l" '0'
+  os::cmd::try_until_text "${BIN} get pods --no-headers -l radanalytics.io/sparkapplication=my-spark-app 2> /dev/null | wc -l" '0'
 }
 
 run_tests() {

From 4b4106d1b6147b900c3545de2c690e5fe669c994 Mon Sep 17 00:00:00 2001
From: Jirka Kremser <jkremser@redhat.com>
Date: Tue, 20 Nov 2018 15:57:35 +0100
Subject: [PATCH 06/11] tests: redefine operator_pod for logs

---
 .travis/.travis.test-restarts.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.travis/.travis.test-restarts.sh b/.travis/.travis.test-restarts.sh
index f09b9162..aa141b73 100755
--- a/.travis/.travis.test-restarts.sh
+++ b/.travis/.travis.test-restarts.sh
@@ -78,6 +78,7 @@ testCreateOperator() {
     os::cmd::try_until_text "${BIN} get crd" 'sparkclusters.radanalytics.io'
   fi
   sleep 10
+  export operator_pod=`${BIN} get pod -l app.kubernetes.io/name=spark-operator -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'`
 }
 
 testCreateCluster() {
@@ -90,9 +91,9 @@ testCreateCluster() {
 
 testKillOperator() {
   info
-  local pod=`${BIN} get pod -l app.kubernetes.io/name=spark-operator -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'`
-  os::cmd::expect_success_and_text "${BIN} delete pod $pod" 'pod "?'$pod'"? deleted' && \
-  sleep 7
+  os::cmd::expect_success_and_text "${BIN} delete pod $operator_pod" 'pod "?'$operator_pod'"? deleted' && \
+  sleep 10
+  export operator_pod=`${BIN} get pod -l app.kubernetes.io/name=spark-operator -o='jsonpath="{.items[0].metadata.name}"' | sed 's/"//g'`
 }
 
 testScaleCluster() {

From 9ab7e26b47ba4693534fe858e715a4ab62e47090 Mon Sep 17 00:00:00 2001
From: Jirka Kremser <jkremser@redhat.com>
Date: Tue, 20 Nov 2018 16:44:25 +0100
Subject: [PATCH 07/11] Fix: NPE that was found thanks to Travis CI

---
 .../io/radanalytics/operator/cluster/SparkClusterOperator.java  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java b/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java
index f735eef1..781916da 100644
--- a/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java
+++ b/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java
@@ -119,7 +119,7 @@ public void fullReconciliation() {
             Integer actualWorkers = actual.get(dCluster.getName());
             if (actualWorkers != null && desiredWorkers != actualWorkers) {
                 // update the internal representation with the actual # of workers
-                Optional.ofNullable(clusters.getCluster(dCluster.getName()).getWorker())
+                Optional.ofNullable(clusters.getCluster(dCluster.getName())).map(SparkCluster::getWorker)
                         .ifPresent(worker -> worker.setInstances(actualWorkers));
                 onModify(dCluster);
             }

From 43f8a3bcce04aaa61924b87e8dc6eb2e9087e19a Mon Sep 17 00:00:00 2001
From: Jirka Kremser <jkremser@redhat.com>
Date: Wed, 21 Nov 2018 13:10:30 +0100
Subject: [PATCH 08/11] Running-cluster count can't go negative (full
 reconciliation edge case)

---
 .../io/radanalytics/operator/cluster/RunningClusters.java | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java b/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java
index 5b2e777d..a17ad656 100644
--- a/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java
+++ b/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java
@@ -43,9 +43,11 @@ public void put(SparkCluster ci) {
     }
 
     public void delete(String name) {
-        runningClusters.dec();
-        workers.labels(name).set(0);
-        clusters.remove(name);
+        if (clusters.containsKey(name)) {
+            runningClusters.dec();
+            workers.labels(name).set(0);
+            clusters.remove(name);
+        }
     }
 
     public SparkCluster getCluster(String name) {

From 2574e839a92ab81204a1bee59943bb6a4931d836 Mon Sep 17 00:00:00 2001
From: Jirka Kremser <jkremser@redhat.com>
Date: Wed, 21 Nov 2018 15:16:45 +0100
Subject: [PATCH 09/11] Run full reconciliation for scaling after an operator
 restart, when the internal representation of the state was lost

---
 .../operator/cluster/RunningClusters.java     |  8 +++-
 .../cluster/SparkClusterOperator.java         | 38 +++++++++++++++++--
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java b/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java
index a17ad656..fa63bd8a 100644
--- a/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java
+++ b/src/main/java/io/radanalytics/operator/cluster/RunningClusters.java
@@ -23,7 +23,7 @@ public class RunningClusters {
             .labelNames("cluster")
             .register();
 
-    public static final Counter startedTotal = Counter.build()
+    public static final Gauge startedTotal = Gauge.build()
             .name("operator_started_total")
             .help("Spark clusters has been started by operator.")
             .register();
@@ -54,4 +54,10 @@ public SparkCluster getCluster(String name) {
         return this.clusters.get(name);
     }
 
+    public void resetMetrics() {
+        startedTotal.set(0);
+        workers.clear();
+        startedTotal.set(0);
+    }
+
 }
diff --git a/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java b/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java
index 781916da..5f544081 100644
--- a/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java
+++ b/src/main/java/io/radanalytics/operator/cluster/SparkClusterOperator.java
@@ -1,5 +1,6 @@
 package io.radanalytics.operator.cluster;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.base.Functions;
 import com.google.common.collect.Sets;
 import io.fabric8.kubernetes.api.model.DoneableReplicationController;
@@ -18,7 +19,9 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
 import java.util.*;
+import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.stream.Collectors;
 
 import static io.radanalytics.operator.common.AnsiColors.*;
@@ -82,6 +85,7 @@ public void fullReconciliation() {
 //        5. modify / scale
 
         log.info("Running full reconciliation for namespace {} and kind {}..", namespace, entityName);
+        final AtomicBoolean change = new AtomicBoolean(false);
         Set<SparkCluster> desiredSet = super.getDesiredSet();
         Map<String, SparkCluster> desiredMap = desiredSet.stream().collect(Collectors.toMap(SparkCluster::getName, Functions.identity()));
         Map<String, Integer> actual = getActual();
@@ -94,9 +98,11 @@ public void fullReconciliation() {
 
         if (!toBeCreated.isEmpty()) {
             log.info("toBeCreated: {}", toBeCreated);
+            change.set(true);
         }
         if (!toBeDeleted.isEmpty()) {
             log.info("toBeDeleted: {}", toBeDeleted);
+            change.set(true);
         }
 
         // add new
@@ -118,12 +124,38 @@ public void fullReconciliation() {
             int desiredWorkers = Optional.ofNullable(dCluster.getWorker()).orElse(new RCSpec()).getInstances();
             Integer actualWorkers = actual.get(dCluster.getName());
             if (actualWorkers != null && desiredWorkers != actualWorkers) {
-                // update the internal representation with the actual # of workers
-                Optional.ofNullable(clusters.getCluster(dCluster.getName())).map(SparkCluster::getWorker)
-                        .ifPresent(worker -> worker.setInstances(actualWorkers));
+                change.set(true);
+                // update the internal representation with the actual # of workers and call onModify
+                if (clusters.getCluster(dCluster.getName()) == null) {
+                    // deep copy via json -> room for optimization
+                    ObjectMapper om = new ObjectMapper();
+                    try {
+                        SparkCluster actualCluster = om.readValue(om.writeValueAsString(dCluster), SparkCluster.class);
+                        Optional.ofNullable(actualCluster.getWorker()).ifPresent(w -> w.setInstances(actualWorkers));
+                        clusters.put(actualCluster);
+                    } catch (IOException e) {
+                        log.warn(e.getMessage());
+                        e.printStackTrace();
+                        return;
+                    }
+                } else {
+                    Optional.ofNullable(clusters.getCluster(dCluster.getName())).map(SparkCluster::getWorker)
+                            .ifPresent(worker -> worker.setInstances(actualWorkers));
+                }
+                log.info("scaling cluster {}", dCluster.getName());
                 onModify(dCluster);
             }
         });
+
+        // first reconciliation after (re)start -> update the clusters instance
+        if (!fullReconciliationRun) {
+            clusters.resetMetrics();
+            desiredMap.entrySet().forEach(e -> clusters.put(e.getValue()));
+        }
+
+        if (!change.get()) {
+            log.info("no change was detected during the reconciliation");
+        }
     }
 
     private Map<String, Integer> getActual() {

From 9a687b7cb3f42aa8edaaa20df7678c0951314726 Mon Sep 17 00:00:00 2001
From: Jirka Kremser <jkremser@redhat.com>
Date: Wed, 21 Nov 2018 16:21:32 +0100
Subject: [PATCH 10/11] travis.yml: lint

---
 .travis.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index b965e685..a61e3e59 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -53,17 +53,17 @@ jobs:
       name: "Restarts [oc • CMs]"
       env: BIN=oc VERSION=v3.9.0 CRD=0
       script:
-      - make build-travis
-      - ./.travis/.travis.prepare.openshift.sh
-      - ./.travis/.travis.test-restarts.sh
+        - make build-travis
+        - ./.travis/.travis.prepare.openshift.sh
+        - ./.travis/.travis.test-restarts.sh
 
     - stage:
       name: "Restarts [K8s • CRs]"
       env: BIN=kubectl VERSION=v1.9.0 CRD=1 MINIKUBE_VERSION=v0.25.2
       script:
-      - make build-travis
-      - ./.travis/.travis.prepare.minikube.sh
-      - ./.travis/.travis.test-restarts.sh
+        - make build-travis
+        - ./.travis/.travis.prepare.minikube.sh
+        - ./.travis/.travis.test-restarts.sh
 
     - stage: deploy
       script: skip

From 73a98e5562002f466adb78782d6243ff4253886b Mon Sep 17 00:00:00 2001
From: Jirka Kremser <jkremser@redhat.com>
Date: Wed, 21 Nov 2018 17:14:23 +0100
Subject: [PATCH 11/11] travis.yml: fix the syntax

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index a61e3e59..674ad637 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -66,8 +66,8 @@ jobs:
         - ./.travis/.travis.test-restarts.sh
 
     - stage: deploy
-      script: skip
-      deploy:
+      name: "Push container images"
+      script:
         # release x.y.z or x.y.z-centos if there is a release
         # or release the latest image if building the master branch
         - ./.travis/.travis.release.images.sh