Create health condition for degraded status / quorum issues (#360)

* Add DatacenterHealth condition
* Modify the DatacenterHealth status update to occur on the starting nodes part
* Fix rebase
* Add description to DatacenterHealthy
* Add lint to the Makefile, remove unused isDegraded

(cherry picked from commit b498b7e)
k8ssandra · Jul 18, 2022 · 5215c1c · 5215c1c
1 parent ea143a9
commit 5215c1c
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti
 
 ## unreleased
 
+* [ENHANCEMENT] [#360](https://github.com/k8ssandra/cass-operator/pull/360) If the Datacenter quorum check reports an unhealthy state, set the DatacenterHealthy status condition to False (DBPE-2283)
 * [BUGFIX] [#355](https://github.com/k8ssandra/cass-operator/issues/335) Cleanse label values derived from cluster name, which can contain illegal chars.
 * [BUGFIX] [#330](https://github.com/k8ssandra/cass-operator/issues/330) Apply correct updates to Service labels and annotations through additionalServiceConfig (they are now validated and don't allow reserved prefixes).
 * [BUGFIX] [#368](https://github.com/k8ssandra/cass-operator/issues/368) Do not fetch endpointStatus from pods that have not started

diff --git a/apis/cassandra/v1beta1/cassandradatacenter_types.go b/apis/cassandra/v1beta1/cassandradatacenter_types.go
@@ -328,6 +328,10 @@ const (
 	DatacenterRollingRestart DatacenterConditionType = "RollingRestart"
 	DatacenterValid          DatacenterConditionType = "Valid"
 	DatacenterDecommission   DatacenterConditionType = "Decommission"
+
+	// DatacenterHealthy indicates whether a LOCAL_QUORUM query succeeds on every started node.
+	// While this check fails, certain operations such as scaling up will not proceed.
+	DatacenterHealthy DatacenterConditionType = "Healthy"
 )
 
 type DatacenterCondition struct {

diff --git a/pkg/reconciliation/reconcile_racks.go b/pkg/reconciliation/reconcile_racks.go
@@ -623,12 +623,16 @@ func (rc *ReconciliationContext) CheckPodsReady(endpointData httphelper.CassMeta
 
 	// step 3 - get all nodes up
 	// if the cluster isn't healthy, that's ok, but go back to step 1
-	if !rc.isClusterHealthy() {
+	clusterHealthy := rc.isClusterHealthy()
+	if err := rc.updateHealth(clusterHealthy); err != nil {
+		return result.Error(err)
+	}
+
+	if !clusterHealthy {
 		rc.ReqLogger.Info(
 			"cluster isn't healthy",
 		)
-		// FIXME this is one spot I've seen get spammy, should we raise this number?
-		return result.RequeueSoon(2)
+		return result.RequeueSoon(5)
 	}
 
 	needsMoreNodes, err := rc.startAllNodes(endpointData)
@@ -1179,6 +1183,30 @@ func (rc *ReconciliationContext) UpdateStatus() result.ReconcileResult {
 	return result.Continue()
 }
 
+// updateHealth records the given health state in the DatacenterHealthy status
+// condition: ConditionTrue when healthy is true, ConditionFalse otherwise.
+// The status is patched through rc.Client only when setCondition returns true;
+// any error from that patch is returned to the caller unchanged.
+func (rc *ReconciliationContext) updateHealth(healthy bool) error {
+	// Take the copy before the condition is modified so it serves as the
+	// base object for the merge patch.
+	patch := client.MergeFrom(rc.Datacenter.DeepCopy())
+
+	status := corev1.ConditionTrue
+	if !healthy {
+		status = corev1.ConditionFalse
+	}
+
+	changed := rc.setCondition(api.NewDatacenterCondition(api.DatacenterHealthy, status))
+	if !changed {
+		// Nothing to persist; skip the patch call.
+		return nil
+	}
+
+	return rc.Client.Status().Patch(rc.Ctx, rc.Datacenter, patch)
+}
+
 func hasBeenXMinutes(x int, sinceTime time.Time) bool {
 	xMinutesAgo := time.Now().Add(time.Minute * time.Duration(-x))
 	return sinceTime.Before(xMinutesAgo)
@@ -1307,6 +1335,8 @@ func (rc *ReconciliationContext) deleteStuckNodes() (bool, error) {
 	return false, nil
 }
 
+// isClusterHealthy does a LOCAL_QUORUM query to the Cassandra pods and returns true if all the pods were able to
+// respond without error.
 func (rc *ReconciliationContext) isClusterHealthy() bool {
 	pods := FilterPodListByCassNodeState(rc.clusterPods, stateStarted)