Create health condition for degraded status / quorum issues #360

Merged · 5 commits · Jul 13, 2022
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -10,8 +10,8 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti
```

## unreleased
* [ENHANCEMENT] [#360](https://github.com/k8ssandra/cass-operator/pull/360) If the datacenter quorum check reports an unhealthy state, set the Status Condition DatacenterHealthy to False (DBPE-2283)
* [BUGFIX] [#335](https://github.com/k8ssandra/cass-operator/issues/335) Cleanse label values derived from the cluster name, which can contain illegal characters. Include the app.kubernetes.io/created-by label.

* [BUGFIX] [#330](https://github.com/k8ssandra/cass-operator/issues/330) Apply correct updates to Service labels and annotations through additionalServiceConfig (they are now validated and don't allow reserved prefixes).

## v1.11.0
2 changes: 1 addition & 1 deletion Makefile
@@ -119,7 +119,7 @@ lint: ## Run golangci-lint against code.
golangci-lint run ./...

.PHONY: test
test: manifests generate fmt vet envtest ## Run tests.
test: manifests generate fmt vet lint envtest ## Run tests.
# Old unit tests first - these use mocked client / fakeclient
go test ./pkg/... -coverprofile cover-pkg.out
# Then the envtest ones
4 changes: 4 additions & 0 deletions apis/cassandra/v1beta1/cassandradatacenter_types.go
@@ -328,6 +328,10 @@ const (
DatacenterRollingRestart DatacenterConditionType = "RollingRestart"
DatacenterValid DatacenterConditionType = "Valid"
DatacenterDecommission DatacenterConditionType = "Decommission"

// DatacenterHealthy indicates if QUORUM can be reached from all deployed nodes.
// If this check fails, certain operations such as scaling up will not proceed.
DatacenterHealthy DatacenterConditionType = "Healthy"
)

type DatacenterCondition struct {
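Not part of the PR diff: a minimal sketch of how a client of the operator might read the new condition, assuming the published module path github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1 and a Status.Conditions slice whose entries expose Type and Status fields; the isDatacenterHealthy helper is hypothetical.

```go
package example

import (
	corev1 "k8s.io/api/core/v1"

	api "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
)

// isDatacenterHealthy treats a missing Healthy condition as healthy, since
// operator versions before this PR never set it; only an explicit False is
// reported as unhealthy.
func isDatacenterHealthy(dc *api.CassandraDatacenter) bool {
	for _, cond := range dc.Status.Conditions {
		if cond.Type == api.DatacenterHealthy {
			return cond.Status != corev1.ConditionFalse
		}
	}
	return true
}
```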
36 changes: 33 additions & 3 deletions pkg/reconciliation/reconcile_racks.go
@@ -623,12 +623,16 @@ func (rc *ReconciliationContext) CheckPodsReady(endpointData httphelper.CassMeta

// step 3 - get all nodes up
// if the cluster isn't healthy, that's ok, but go back to step 1
if !rc.isClusterHealthy() {
clusterHealthy := rc.isClusterHealthy()
if err := rc.updateHealth(clusterHealthy); err != nil {
return result.Error(err)
}

if !clusterHealthy {
rc.ReqLogger.Info(
"cluster isn't healthy",
)
// FIXME this is one spot I've seen get spammy, should we raise this number?
return result.RequeueSoon(2)
return result.RequeueSoon(5)
}

needsMoreNodes, err := rc.startAllNodes(endpointData)
@@ -1161,6 +1165,30 @@ func (rc *ReconciliationContext) UpdateStatus() result.ReconcileResult {
return result.Continue()
}

func (rc *ReconciliationContext) updateHealth(healthy bool) error {
updated := false
dcPatch := client.MergeFrom(rc.Datacenter.DeepCopy())

if !healthy {
updated = rc.setCondition(
api.NewDatacenterCondition(
api.DatacenterHealthy, corev1.ConditionFalse))

Contributor: If the cluster is degraded or unhealthy, should we just requeue?

Contributor Author: We already do that elsewhere; this is just to add the status (as I thought the DBPE-2283 wish was). If we requeue here, then we could never recover from the wrong status.

} else {
updated = rc.setCondition(
api.NewDatacenterCondition(
api.DatacenterHealthy, corev1.ConditionTrue))
}

if updated {
err := rc.Client.Status().Patch(rc.Ctx, rc.Datacenter, dcPatch)
if err != nil {
return err
}
}

return nil
}

func hasBeenXMinutes(x int, sinceTime time.Time) bool {
xMinutesAgo := time.Now().Add(time.Minute * time.Duration(-x))
return sinceTime.Before(xMinutesAgo)
@@ -1279,6 +1307,8 @@ func (rc *ReconciliationContext) deleteStuckNodes() (bool, error) {
return false, nil
}

// isClusterHealthy does a LOCAL_QUORUM query to the Cassandra pods and returns true if all the pods were able to
// respond without error.
func (rc *ReconciliationContext) isClusterHealthy() bool {
pods := FilterPodListByCassNodeState(rc.clusterPods, stateStarted)

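The body of isClusterHealthy is collapsed in the diff above, so here is a minimal sketch of the aggregation its doc comment describes, under the assumption that a per-pod quorum probe is available; probeQuorum is a hypothetical stand-in for the management-api call the operator actually performs, which is not shown here.

```go
package example

import corev1 "k8s.io/api/core/v1"

// allPodsAnswerQuorum reports whether every started Cassandra pod answers a
// quorum-level probe; a single failing pod marks the whole datacenter as
// unhealthy, which the caller turns into DatacenterHealthy=False and a requeue.
func allPodsAnswerQuorum(startedPods []*corev1.Pod, probeQuorum func(*corev1.Pod) error) bool {
	for _, pod := range startedPods {
		if err := probeQuorum(pod); err != nil {
			return false
		}
	}
	return true
}
```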