Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add health and availableNodes in the OpenSearchCluster status #655

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ spec:
scope: Namespaced
versions:
- additionalPrinterColumns:
- jsonPath: .status.health
name: health
type: string
- description: Available nodes
jsonPath: .status.availableNodes
name: nodes
type: integer
- description: Opensearch version
jsonPath: .status.version
name: version
Expand Down Expand Up @@ -4978,6 +4985,10 @@ spec:
status:
description: ClusterStatus defines the observed state of Es
properties:
availableNodes:
description: AvailableNodes is the number of available instances.
format: int32
type: integer
componentsStatus:
items:
properties:
Expand All @@ -4993,6 +5004,10 @@ spec:
type: string
type: object
type: array
health:
description: OpenSearchHealth is the health of the cluster as returned
by the health API.
type: string
initialized:
type: boolean
phase:
Expand Down
16 changes: 16 additions & 0 deletions opensearch-operator/api/v1/opensearch_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@ const (
PhaseRunning = "RUNNING"
)

// OpenSearchHealth is a string-typed value holding the health of the cluster
// as returned by the health API.
type OpenSearchHealth string

// Health values an OpenSearchHealth can take.
const (
	// OpenSearchRedHealth is the "red" health value.
	OpenSearchRedHealth = OpenSearchHealth("red")
	// OpenSearchYellowHealth is the "yellow" health value.
	OpenSearchYellowHealth = OpenSearchHealth("yellow")
	// OpenSearchGreenHealth is the "green" health value.
	OpenSearchGreenHealth = OpenSearchHealth("green")
	// OpenSearchUnknownHealth is the "unknown" health value, a placeholder
	// for when the actual health is not known.
	OpenSearchUnknownHealth = OpenSearchHealth("unknown")
)

// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.

Expand Down Expand Up @@ -292,12 +303,17 @@ type ClusterStatus struct {
ComponentsStatus []ComponentStatus `json:"componentsStatus"`
Version string `json:"version,omitempty"`
Initialized bool `json:"initialized,omitempty"`
// AvailableNodes is the number of available instances.
AvailableNodes int32 `json:"availableNodes,omitempty"`
Health OpenSearchHealth `json:"health,omitempty"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:shortName=os;opensearch
// Es is the Schema for the es API
// +kubebuilder:printcolumn:name="health",type="string",JSONPath=".status.health"
// +kubebuilder:printcolumn:name="nodes",type="integer",JSONPath=".status.availableNodes",description="Available nodes"
// +kubebuilder:printcolumn:name="version",type="string",JSONPath=".status.version",description="Opensearch version"
// +kubebuilder:printcolumn:name="phase",type="string",JSONPath=".status.phase"
// +kubebuilder:printcolumn:name="age",type="date",JSONPath=".metadata.creationTimestamp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ spec:
scope: Namespaced
versions:
- additionalPrinterColumns:
- jsonPath: .status.health
name: health
type: string
- description: Available nodes
jsonPath: .status.availableNodes
name: nodes
type: integer
- description: Opensearch version
jsonPath: .status.version
name: version
Expand Down Expand Up @@ -4978,6 +4985,10 @@ spec:
status:
description: ClusterStatus defines the observed state of Es
properties:
availableNodes:
description: AvailableNodes is the number of available instances.
format: int32
type: integer
componentsStatus:
items:
properties:
Expand All @@ -4993,6 +5004,10 @@ spec:
type: string
type: object
type: array
health:
description: OpenSearchHealth is the health of the cluster as returned
by the health API.
type: string
initialized:
type: boolean
phase:
Expand Down
21 changes: 21 additions & 0 deletions opensearch-operator/pkg/reconcilers/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ func (r *ClusterReconciler) Reconcile() (ctrl.Result, error) {
})
result.CombineErr(err)
}

if r.instance.Spec.General.SnapshotRepositories != nil && len(r.instance.Spec.General.SnapshotRepositories) > 0 {
// Calculate checksum and check for changes
result.Combine(r.ReconcileSnapshotRepoConfig(username))
Expand All @@ -143,6 +144,9 @@ func (r *ClusterReconciler) Reconcile() (ctrl.Result, error) {
result.Combine(r.checkForEmptyDirRecovery())
}

// Update the CR status to reflect the current OpenSearch health and nodes
result.CombineErr(r.UpdateClusterStatus())

return result.Result, result.Err
}

Expand Down Expand Up @@ -548,3 +552,20 @@ func (r *ClusterReconciler) deleteSTSWithOrphan(existing *appsv1.StatefulSet) er
}
return nil
}

// UpdateClusterStatus writes the current cluster health and the number of
// available nodes into the status of the CR.
//
// Both values are collected once up front. The status write itself runs under
// retry.RetryOnConflict: every attempt re-reads the CR into r.instance, sets
// the two status fields and issues a status update. The error from the retry
// helper is returned as is.
func (r *ClusterReconciler) UpdateClusterStatus() error {
	currentHealth := util.GetClusterHealth(r.ctx, r.Client, r.instance, r.logger)
	readyNodes := util.GetAvailableOpenSearchNodes(r.ctx, r.Client, r.instance, r.logger)

	// One attempt: refresh the object, apply the new status, push it.
	writeStatus := func() error {
		key := client.ObjectKeyFromObject(r.instance)
		if getErr := r.Get(r.ctx, key, r.instance); getErr != nil {
			return getErr
		}
		r.instance.Status.Health = currentHealth
		r.instance.Status.AvailableNodes = readyNodes
		return r.Status().Update(r.ctx, r.instance)
	}

	return retry.RetryOnConflict(retry.DefaultRetry, writeStatus)
}
50 changes: 47 additions & 3 deletions opensearch-operator/pkg/reconcilers/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"crypto/sha1"
"encoding/hex"
"fmt"

"github.com/go-logr/logr"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -237,9 +239,6 @@ func CreateClientForCluster(
services.WithTransport(transport),
)
}
if err != nil {
lg.Error(err, "failed to create client")
}

return osClient, err
}
Expand Down Expand Up @@ -271,3 +270,48 @@ func GetSha1Sum(data []byte) (string, error) {

return hex.EncodeToString(hasher.Sum(nil)), nil
}

// GetClusterHealth returns the health of OpenSearch cluster.
//
// It creates an OpenSearch client with CreateClientForCluster and asks it for
// the cluster health. If either the client creation or the health request
// fails, OpenSearchUnknownHealth is returned; otherwise the Status field of
// the health response is converted to an OpenSearchHealth and returned.
func GetClusterHealth(ctx context.Context, k8sClient client.Client, cluster *opsterv1.OpenSearchCluster, lg logr.Logger) opsterv1.OpenSearchHealth {
osClient, err := CreateClientForCluster(ctx, k8sClient, cluster, nil)

if err != nil {
// Logged at V(1) instead of as an error: client creation is expected to
// fail while the cluster is first coming up, and logging it at the default
// level would flood the logs.
lg.V(1).Info(fmt.Sprintf("Failed to create OS client while checking cluster health: %v", err))
return opsterv1.OpenSearchUnknownHealth
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error should be logged at least, for debugging purposes. For the other places also.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't want to log it because when the cluster is first coming up, it will always fail to create the client and the logs are filled with error messages. Same at other places as well, when the sts is just being created, it returns a few not found error

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe log it on debug level, then it should normally not appear as the default log level is info. And in case someone has a problem we can ask for debug logs.

}

healthResponse, err := osClient.GetClusterHealth()

if err != nil {
// Unlike the client-creation failure above, a failed health request is
// logged as an error.
lg.Error(err, "Failed to get OpenSearch health status")
return opsterv1.OpenSearchUnknownHealth
}

// The response status string is converted to the typed health value as is.
return opsterv1.OpenSearchHealth(healthResponse.Status)
}

// GetAvailableOpenSearchNodes adds up the ready replicas reported by the
// StatefulSet of every node pool in the cluster spec.
//
// If the StatefulSet lookup fails for any node pool, the failure is logged at
// V(1) and the value already stored in cluster.Status.AvailableNodes is
// returned instead of a partial sum. A node pool whose lookup yields a nil
// StatefulSet without an error contributes nothing to the total.
func GetAvailableOpenSearchNodes(ctx context.Context, k8sClient client.Client, cluster *opsterv1.OpenSearchCluster, lg logr.Logger) int32 {
	name, namespace := cluster.Name, cluster.Namespace

	// Remembered before the loop so it can be handed back on lookup failure.
	fallback := cluster.Status.AvailableNodes

	var total int32
	for _, pool := range cluster.Spec.NodePools {
		sts, err := helpers.GetSTSForNodePool(ctx, k8sClient, pool, name, namespace)
		if err != nil {
			lg.V(1).Info(fmt.Sprintf("Failed to get statefulsets for nodepool %s: %v", pool.Component, err))
			return fallback
		}
		if sts == nil {
			continue
		}
		total += sts.Status.ReadyReplicas
	}

	return total
}
Loading