Skip to content

Commit

Permalink
Create health condition for degraded status / quorum issues (#360)
Browse files Browse the repository at this point in the history
* Add DatacenterHealth condition

* Move the DatacenterHealth status update into the step that starts the nodes

* Fix rebase

* Add description to DatacenterHealthy

* Add lint to the Makefile, remove unused isDegraded

(cherry picked from commit b498b7e)
  • Loading branch information
burmanm committed Jul 18, 2022
1 parent ea143a9 commit 5215c1c
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti

## unreleased

* [ENHANCEMENT] [#360](https://github.com/k8ssandra/cass-operator/pull/360) If Datacenter quorum reports unhealthy state, change Status Condition DatacenterHealthy to False (DBPE-2283)
* [BUGFIX] [#335](https://github.com/k8ssandra/cass-operator/issues/335) Cleanse label values derived from cluster name, which can contain illegal chars.
* [BUGFIX] [#330](https://github.com/k8ssandra/cass-operator/issues/330) Apply correct updates to Service labels and annotations through additionalServiceConfig (they are now validated and don't allow reserved prefixes).
* [BUGFIX] [#368](https://github.com/k8ssandra/cass-operator/issues/368) Do not fetch endpointStatus from pods that have not started
Expand Down
4 changes: 4 additions & 0 deletions apis/cassandra/v1beta1/cassandradatacenter_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,10 @@ const (
DatacenterRollingRestart DatacenterConditionType = "RollingRestart"
DatacenterValid DatacenterConditionType = "Valid"
DatacenterDecommission DatacenterConditionType = "Decommission"

// DatacenterHealthy indicates if QUORUM can be reached from all deployed nodes.
// If this check fails, certain operations such as scaling up will not proceed.
DatacenterHealthy DatacenterConditionType = "Healthy"
)

type DatacenterCondition struct {
Expand Down
36 changes: 33 additions & 3 deletions pkg/reconciliation/reconcile_racks.go
Original file line number Diff line number Diff line change
Expand Up @@ -623,12 +623,16 @@ func (rc *ReconciliationContext) CheckPodsReady(endpointData httphelper.CassMeta

// step 3 - get all nodes up
// if the cluster isn't healthy, that's ok, but go back to step 1
if !rc.isClusterHealthy() {
clusterHealthy := rc.isClusterHealthy()
if err := rc.updateHealth(clusterHealthy); err != nil {
return result.Error(err)
}

if !clusterHealthy {
rc.ReqLogger.Info(
"cluster isn't healthy",
)
// FIXME this is one spot I've seen get spammy, should we raise this number?
return result.RequeueSoon(2)
return result.RequeueSoon(5)
}

needsMoreNodes, err := rc.startAllNodes(endpointData)
Expand Down Expand Up @@ -1179,6 +1183,30 @@ func (rc *ReconciliationContext) UpdateStatus() result.ReconcileResult {
return result.Continue()
}

// updateHealth records the outcome of the cluster health check in the
// DatacenterHealthy status condition of rc.Datacenter.
//
// healthy selects the condition status: corev1.ConditionTrue when true,
// corev1.ConditionFalse otherwise. The status patch is sent only when
// setCondition returns true; any error from that patch call is returned
// unchanged, and nil is returned in every other case.
func (rc *ReconciliationContext) updateHealth(healthy bool) error {
	status := corev1.ConditionTrue
	if !healthy {
		status = corev1.ConditionFalse
	}

	// Snapshot the datacenter before setCondition runs so the merge patch is
	// built from the pre-update copy (presumably setCondition mutates
	// rc.Datacenter in place -- confirm against its definition).
	base := client.MergeFrom(rc.Datacenter.DeepCopy())

	changed := rc.setCondition(api.NewDatacenterCondition(api.DatacenterHealthy, status))
	if !changed {
		// Nothing to persist.
		return nil
	}

	return rc.Client.Status().Patch(rc.Ctx, rc.Datacenter, base)
}

func hasBeenXMinutes(x int, sinceTime time.Time) bool {
xMinutesAgo := time.Now().Add(time.Minute * time.Duration(-x))
return sinceTime.Before(xMinutesAgo)
Expand Down Expand Up @@ -1307,6 +1335,8 @@ func (rc *ReconciliationContext) deleteStuckNodes() (bool, error) {
return false, nil
}

// isClusterHealthy does a LOCAL_QUORUM query to the Cassandra pods and returns true if all the pods were able to
// respond without error.
func (rc *ReconciliationContext) isClusterHealthy() bool {
pods := FilterPodListByCassNodeState(rc.clusterPods, stateStarted)

Expand Down

0 comments on commit 5215c1c

Please sign in to comment.