diff --git a/CHANGELOG.md b/CHANGELOG.md
index 15be65c5..edf4cf4e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti
 
 ## unreleased
 
+* [ENHANCEMENT] [#360](https://github.com/k8ssandra/cass-operator/pull/360) If the Datacenter quorum check reports an unhealthy state, set the Status Condition DatacenterHealthy to False (DBPE-2283)
 * [BUGFIX] [#355](https://github.com/k8ssandra/cass-operator/issues/335) Cleanse label values derived from cluster name, which can contain illegal chars.
 * [BUGFIX] [#330](https://github.com/k8ssandra/cass-operator/issues/330) Apply correct updates to Service labels and annotations through additionalServiceConfig (they are now validated and don't allow reserved prefixes).
 * [BUGFIX] [#368](https://github.com/k8ssandra/cass-operator/issues/368) Do not fetch endpointStatus from pods that have not started
diff --git a/apis/cassandra/v1beta1/cassandradatacenter_types.go b/apis/cassandra/v1beta1/cassandradatacenter_types.go
index dc4337df..60471621 100644
--- a/apis/cassandra/v1beta1/cassandradatacenter_types.go
+++ b/apis/cassandra/v1beta1/cassandradatacenter_types.go
@@ -328,6 +328,10 @@ const (
 	DatacenterRollingRestart DatacenterConditionType = "RollingRestart"
 	DatacenterValid DatacenterConditionType = "Valid"
 	DatacenterDecommission DatacenterConditionType = "Decommission"
+
+	// DatacenterHealthy indicates if LOCAL_QUORUM can be reached from all deployed nodes.
+	// If this check fails, certain operations such as scaling up will not proceed.
+	DatacenterHealthy DatacenterConditionType = "Healthy"
 )
 
 type DatacenterCondition struct {
diff --git a/pkg/reconciliation/reconcile_racks.go b/pkg/reconciliation/reconcile_racks.go
index cb068d3c..10bdc720 100644
--- a/pkg/reconciliation/reconcile_racks.go
+++ b/pkg/reconciliation/reconcile_racks.go
@@ -623,12 +623,16 @@ func (rc *ReconciliationContext) CheckPodsReady(endpointData httphelper.CassMeta
 
 	// step 3 - get all nodes up
 	// if the cluster isn't healthy, that's ok, but go back to step 1
-	if !rc.isClusterHealthy() {
+	clusterHealthy := rc.isClusterHealthy()
+	if err := rc.updateHealth(clusterHealthy); err != nil {
+		return result.Error(err)
+	}
+
+	if !clusterHealthy {
 		rc.ReqLogger.Info(
 			"cluster isn't healthy",
 		)
-		// FIXME this is one spot I've seen get spammy, should we raise this number?
-		return result.RequeueSoon(2)
+		return result.RequeueSoon(5)
 	}
 
 	needsMoreNodes, err := rc.startAllNodes(endpointData)
@@ -1179,6 +1183,30 @@ func (rc *ReconciliationContext) UpdateStatus() result.ReconcileResult {
 	return result.Continue()
 }
 
+// updateHealth records the given health state in the DatacenterHealthy
+// status condition: ConditionTrue when healthy is true, ConditionFalse
+// otherwise. The Datacenter status is patched only when setCondition
+// returns true. It returns the error from the status patch, or nil when no
+// patch was needed or the patch succeeded.
+func (rc *ReconciliationContext) updateHealth(healthy bool) error {
+	// Copy the Datacenter before setCondition runs; the copy is the base
+	// that the status patch is computed from.
+	dcPatch := client.MergeFrom(rc.Datacenter.DeepCopy())
+
+	status := corev1.ConditionTrue
+	if !healthy {
+		status = corev1.ConditionFalse
+	}
+
+	condition := api.NewDatacenterCondition(api.DatacenterHealthy, status)
+	// Skip the API call when setCondition returns false.
+	if !rc.setCondition(condition) {
+		return nil
+	}
+
+	return rc.Client.Status().Patch(rc.Ctx, rc.Datacenter, dcPatch)
+}
+
 func hasBeenXMinutes(x int, sinceTime time.Time) bool {
 	xMinutesAgo := time.Now().Add(time.Minute * time.Duration(-x))
 	return sinceTime.Before(xMinutesAgo)
@@ -1307,6 +1335,8 @@ func (rc *ReconciliationContext) deleteStuckNodes() (bool, error) {
 	return false, nil
 }
 
+// isClusterHealthy does a LOCAL_QUORUM query to the Cassandra pods and returns true if all the pods were able to
+// respond without error.
func (rc *ReconciliationContext) isClusterHealthy() bool { pods := FilterPodListByCassNodeState(rc.clusterPods, stateStarted)