addon: Add support for logging agent health checking (#42)

* Add support for logging agent health checking * remove comment * some refactoring * fix unit test * fix probe fields * add clf and otelcol resource probes to health check * Update internal/addon/addon.go Co-authored-by: Joao Marcal <[email protected]> * Update internal/addon/addon.go Co-authored-by: Joao Marcal <[email protected]> * Update internal/addon/addon.go Co-authored-by: Joao Marcal <[email protected]> * Update internal/addon/addon_test.go Co-authored-by: Joao Marcal <[email protected]> * fixes tests and updates probe type to HealthProberTypeWork * updates health check function * fix naming of variables * add tests for logging and move variables to consts * address reviews * logging: added SA & clusterrole binding to allow for CLF not nameded instance * logging: fix tests --------- Co-authored-by: Bayan Taani <[email protected]> Co-authored-by: Joao Marcal <[email protected]> Co-authored-by: Joao Marcal <[email protected]>
stolostron · Jun 6, 2024 · 5f4b5e2 · 5f4b5e2
1 parent a40750d
commit 5f4b5e2
Show file tree

Hide file tree

Showing 16 changed files with 298 additions and 24 deletions.
diff --git a/internal/addon/addon.go b/internal/addon/addon.go
@@ -1,12 +1,24 @@
 package addon
 
 import (
+	"errors"
+	"fmt"
+
+	otelv1alpha1 "github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1"
+	loggingv1 "github.com/openshift/cluster-logging-operator/apis/logging/v1"
 	"open-cluster-management.io/addon-framework/pkg/agent"
 	"open-cluster-management.io/addon-framework/pkg/utils"
 	addonapiv1alpha1 "open-cluster-management.io/api/addon/v1alpha1"
+	workv1 "open-cluster-management.io/api/work/v1"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
+var (
+	errProbeConditionNotSatisfied = errors.New("probe condition is not satisfied")
+	errProbeValueIsNil            = errors.New("probe value is nil")
+	errValueNotProbed             = errors.New("value not probed")
+)
+
 func NewRegistrationOption(agentName string) *agent.RegistrationOption {
 	return &agent.RegistrationOption{
 		CSRConfigurations: agent.KubeClientSignerConfigurations(Name, agentName),
@@ -30,3 +42,90 @@ func GetObjectKey(configRef []addonapiv1alpha1.ConfigReference, group, resource
 	}
 	return key
 }
+
+// AgentHealthProber returns a HealthProber struct that contains the necessary
+// information to assert if an addon deployment is ready or not.
+func AgentHealthProber() *agent.HealthProber {
+	return &agent.HealthProber{
+		Type: agent.HealthProberTypeWork,
+		WorkProber: &agent.WorkHealthProber{
+			ProbeFields: []agent.ProbeField{
+				{
+					ResourceIdentifier: workv1.ResourceIdentifier{
+						Group:     loggingv1.GroupVersion.Group,
+						Resource:  ClusterLogForwardersResource,
+						Name:      SpokeCLFName,
+						Namespace: SpokeCLFNamespace,
+					},
+					ProbeRules: []workv1.FeedbackRule{
+						{
+							Type: workv1.JSONPathsType,
+							JsonPaths: []workv1.JsonPath{
+								{
+									Name: clfProbeKey,
+									Path: clfProbePath,
+								},
+							},
+						},
+					},
+				},
+				{
+					ResourceIdentifier: workv1.ResourceIdentifier{
+						Group:     otelv1alpha1.GroupVersion.Group,
+						Resource:  OpenTelemetryCollectorsResource,
+						Name:      SpokeOTELColName,
+						Namespace: SpokeOTELColNamespace,
+					},
+					ProbeRules: []workv1.FeedbackRule{
+						{
+							Type: workv1.JSONPathsType,
+							JsonPaths: []workv1.JsonPath{
+								{
+									Name: otelColProbeKey,
+									Path: otelColProbePath,
+								},
+							},
+						},
+					},
+				},
+			},
+			HealthCheck: func(identifier workv1.ResourceIdentifier, result workv1.StatusFeedbackResult) error {
+				for _, value := range result.Values {
+					switch {
+					case identifier.Resource == ClusterLogForwardersResource:
+						if value.Name != clfProbeKey {
+							continue
+						}
+
+						if value.Value.String == nil {
+							return fmt.Errorf("%w: clusterlogforwarder with key %s/%s", errProbeValueIsNil, identifier.Namespace, identifier.Name)
+						}
+
+						if *value.Value.String != "True" {
+							return fmt.Errorf("%w: clusterlogforwarder status condition type is %s for %s/%s", errProbeConditionNotSatisfied, *value.Value.String, identifier.Namespace, identifier.Name)
+						}
+
+						return nil
+					case identifier.Resource == OpenTelemetryCollectorsResource:
+						if value.Name != otelColProbeKey {
+							continue
+						}
+
+						if value.Value.Integer == nil {
+							return fmt.Errorf("%w: opentelemetrycollector with key %s/%s", errProbeValueIsNil, identifier.Namespace, identifier.Name)
+						}
+
+						if *value.Value.Integer < 1 {
+							return fmt.Errorf("%w: opentelemetrycollector replicas is %d for %s/%s", errProbeConditionNotSatisfied, *value.Value.Integer, identifier.Namespace, identifier.Name)
+						}
+
+						return nil
+					default:
+						continue
+					}
+				}
+				return fmt.Errorf("%w: for resource %s with key %s/%s", errValueNotProbed, identifier.Resource, identifier.Namespace, identifier.Name)
+			},
+		},
+	}
+}
diff --git a/internal/addon/addon_test.go b/internal/addon/addon_test.go
@@ -0,0 +1,100 @@
+package addon
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+	workv1 "open-cluster-management.io/api/work/v1"
+
+	otelv1alpha1 "github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1"
+	loggingv1 "github.com/openshift/cluster-logging-operator/apis/logging/v1"
+)
+
+func Test_AgentHealthProber_CLF(t *testing.T) {
+	unhealthyError := fmt.Errorf("%w: clusterlogforwarder status condition type is %s for %s/%s", errProbeConditionNotSatisfied, "False", SpokeCLFNamespace, SpokeCLFName)
+	for _, tc := range []struct {
+		name        string
+		status      string
+		expectedErr string
+	}{
+		{
+			name:   "healthy",
+			status: "True",
+		},
+		{
+			name:        "unhealthy",
+			status:      "False",
+			expectedErr: unhealthyError.Error(),
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			healthProber := AgentHealthProber()
+			err := healthProber.WorkProber.HealthCheck(workv1.ResourceIdentifier{
+				Group:     loggingv1.GroupVersion.Group,
+				Resource:  ClusterLogForwardersResource,
+				Name:      SpokeCLFName,
+				Namespace: SpokeCLFNamespace,
+			}, workv1.StatusFeedbackResult{
+				Values: []workv1.FeedbackValue{
+					{
+						Name: "isReady",
+						Value: workv1.FieldValue{
+							Type:   workv1.String,
+							String: &tc.status,
+						},
+					},
+				},
+			})
+			if tc.expectedErr != "" {
+				require.EqualError(t, err, tc.expectedErr)
+				return
+			}
+			require.NoError(t, err)
+		})
+	}
+}
+
+func Test_AgentHealthProber_OTELCol(t *testing.T) {
+	unhealthyError := fmt.Errorf("%w: opentelemetrycollector replicas is %d for %s/%s", errProbeConditionNotSatisfied, 0, SpokeOTELColNamespace, SpokeOTELColName)
+	for _, tc := range []struct {
+		name        string
+		replicas    int64
+		expectedErr string
+	}{
+		{
+			name:     "healthy",
+			replicas: 1,
+		},
+		{
+			name:        "unhealthy",
+			replicas:    0,
+			expectedErr: unhealthyError.Error(),
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			healthProber := AgentHealthProber()
+			err := healthProber.WorkProber.HealthCheck(workv1.ResourceIdentifier{
+				Group:     otelv1alpha1.GroupVersion.Group,
+				Resource:  OpenTelemetryCollectorsResource,
+				Name:      SpokeOTELColName,
+				Namespace: SpokeOTELColNamespace,
+			}, workv1.StatusFeedbackResult{
+				Values: []workv1.FeedbackValue{
+					{
+						Name: "replicas",
+						Value: workv1.FieldValue{
+							Type:    workv1.Integer,
+							Integer: &tc.replicas,
+						},
+					},
+				},
+			})
+			if tc.expectedErr != "" {
+				require.EqualError(t, err, tc.expectedErr)
+				return
+			}
+			require.NoError(t, err)
+		})
+	}
+}
diff --git a/internal/addon/manifests/charts/mcoa/charts/logging/templates/clusterlogforwarder.yaml b/internal/addon/manifests/charts/mcoa/charts/logging/templates/clusterlogforwarder.yaml
@@ -2,7 +2,7 @@
 apiVersion: logging.openshift.io/v1
 kind: ClusterLogForwarder
 metadata:
-  name: instance
+  name: mcoa-instance
   namespace: openshift-logging
   labels:
     app: {{ template "logginghelm.name" . }}

diff --git a/internal/addon/manifests/charts/mcoa/charts/logging/templates/clusterrole-binding.yaml b/internal/addon/manifests/charts/mcoa/charts/logging/templates/clusterrole-binding.yaml
@@ -0,0 +1,52 @@
+{{- if .Values.enabled }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: openshift-logging:mcoa-logcollector:application-logs
+  labels:
+    app: {{ template "logginghelm.name" . }}
+    chart: {{ template "logginghelm.chart" . }}
+    release: {{ .Release.Name }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: collect-application-logs
+subjects:
+  - kind: ServiceAccount
+    name: mcoa-logcollector
+    namespace: openshift-logging
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: openshift-logging:mcoa-logcollector:audit-logs
+  labels:
+    app: {{ template "logginghelm.name" . }}
+    chart: {{ template "logginghelm.chart" . }}
+    release: {{ .Release.Name }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: collect-audit-logs
+subjects:
+  - kind: ServiceAccount
+    name: mcoa-logcollector
+    namespace: openshift-logging
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: openshift-logging:mcoa-logcollector:infrastructure-logs
+  labels:
+    app: {{ template "logginghelm.name" . }}
+    chart: {{ template "logginghelm.chart" . }}
+    release: {{ .Release.Name }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: collect-infrastructure-logs
+subjects:
+  - kind: ServiceAccount
+    name: mcoa-logcollector
+    namespace: openshift-logging
+{{- end }}
diff --git a/internal/addon/manifests/charts/mcoa/charts/logging/templates/service-account.yaml b/internal/addon/manifests/charts/mcoa/charts/logging/templates/service-account.yaml
@@ -0,0 +1,11 @@
+{{- if .Values.enabled }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: mcoa-logcollector
+  namespace: openshift-logging
+  labels:
+    app: {{ template "logginghelm.name" . }}
+    chart: {{ template "logginghelm.chart" . }}
+    release: {{ .Release.Name }}
+{{- end }}
diff --git a/internal/addon/manifests/charts/mcoa/charts/tracing/templates/namespace.yaml b/internal/addon/manifests/charts/mcoa/charts/tracing/templates/namespace.yaml
@@ -11,7 +11,7 @@ metadata:
 apiVersion: v1
 kind: Namespace
 metadata:
-  name: spoke-otelcol
+  name: mcoa-opentelemetry
   labels:
     app: {{ template "tracinghelm.name" . }}
     chart: {{ template "tracinghelm.chart" . }}

diff --git a/internal/addon/manifests/charts/mcoa/charts/tracing/templates/opentelemetrycollector.yaml b/internal/addon/manifests/charts/mcoa/charts/tracing/templates/opentelemetrycollector.yaml
@@ -2,8 +2,8 @@
 apiVersion: opentelemetry.io/v1beta1
 kind: OpenTelemetryCollector
 metadata:
-  name: spoke-otelcol
-  namespace: spoke-otelcol
+  name: mcoa-instance
+  namespace: mcoa-opentelemetry
   labels:
     app: {{ template "tracinghelm.name" . }}
     chart: {{ template "tracinghelm.chart" . }}

diff --git a/internal/addon/manifests/charts/mcoa/charts/tracing/templates/secret.yaml b/internal/addon/manifests/charts/mcoa/charts/tracing/templates/secret.yaml
@@ -4,7 +4,7 @@ apiVersion: v1
 kind: Secret
 metadata:
   name: {{ $secret_config.name }}
-  namespace: spoke-otelcol
+  namespace: mcoa-opentelemetry
   labels:
     app: {{ template "tracinghelm.name" $ }}
     chart: {{ template "tracinghelm.chart" $ }}

diff --git a/internal/addon/var.go b/internal/addon/var.go
@@ -16,6 +16,18 @@ const (
 
 	AdcLoggingDisabledKey = "loggingDisabled"
 	AdcTracingisabledKey  = "tracingDisabled"
+
+	ClusterLogForwardersResource = "clusterlogforwarders"
+	SpokeCLFName                 = "mcoa-instance"
+	SpokeCLFNamespace            = "openshift-logging"
+	clfProbeKey                  = "isReady"
+	clfProbePath                 = ".status.conditions[?(@.type==\"Ready\")].status"
+
+	OpenTelemetryCollectorsResource = "opentelemetrycollectors"
+	SpokeOTELColName                = "mcoa-instance"
+	SpokeOTELColNamespace           = "mcoa-opentelemetry"
+	otelColProbeKey                 = "replicas"
+	otelColProbePath                = ".spec.replicas"
 )
 
 //go:embed manifests

diff --git a/internal/logging/handlers/handler.go b/internal/logging/handlers/handler.go
@@ -11,16 +11,12 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
-const (
-	clusterLogForwarderResource = "clusterlogforwarders"
-)
-
 func BuildOptions(ctx context.Context, k8s client.Client, mcAddon *addonapiv1alpha1.ManagedClusterAddOn, adoc *addonapiv1alpha1.AddOnDeploymentConfig) (manifests.Options, error) {
 	resources := manifests.Options{
 		AddOnDeploymentConfig: adoc,
 	}
 
-	key := addon.GetObjectKey(mcAddon.Status.ConfigReferences, loggingv1.GroupVersion.Group, clusterLogForwarderResource)
+	key := addon.GetObjectKey(mcAddon.Status.ConfigReferences, loggingv1.GroupVersion.Group, addon.ClusterLogForwardersResource)
 	clf := &loggingv1.ClusterLogForwarder{}
 	if err := k8s.Get(ctx, key, clf, &client.GetOptions{}); err != nil {
 		return resources, err

diff --git a/internal/logging/helm_test.go b/internal/logging/helm_test.go
@@ -92,15 +92,15 @@ func Test_Logging_AllConfigsTogether_AllResources(t *testing.T) {
 			},
 			ConfigReferent: addonapiv1alpha1.ConfigReferent{
 				Namespace: "open-cluster-management",
-				Name:      "instance",
+				Name:      "mcoa-instance",
 			},
 		},
 	}
 
 	// Setup configuration resources: ClusterLogForwarder, AddOnDeploymentConfig, Secrets, ConfigMaps
 	clf = &loggingv1.ClusterLogForwarder{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      "instance",
+			Name:      "mcoa-instance",
 			Namespace: "open-cluster-management",
 			Annotations: map[string]string{
 				"authentication.mcoa.openshift.io/app-logs":     "SecretReference",
@@ -215,7 +215,7 @@ func Test_Logging_AllConfigsTogether_AllResources(t *testing.T) {
 	// Render manifests and return them as k8s runtime objects
 	objects, err := loggingAgentAddon.Manifests(managedCluster, managedClusterAddOn)
 	require.NoError(t, err)
-	require.Equal(t, 7, len(objects))
+	require.Equal(t, 11, len(objects))
 
 	for _, obj := range objects {
 		switch obj := obj.(type) {
@@ -226,6 +226,10 @@ func Test_Logging_AllConfigsTogether_AllResources(t *testing.T) {
 			require.NotNil(t, obj.Spec.Outputs[1].Secret)
 			require.Equal(t, "static-authentication", obj.Spec.Outputs[0].Secret.Name)
 			require.Equal(t, "static-authentication", obj.Spec.Outputs[1].Secret.Name)
+			// Check name and namespace to make sure that if we change the helm
+			// manifests that we don't break the addon probes
+			require.Equal(t, addon.SpokeCLFName, obj.Name)
+			require.Equal(t, addon.SpokeCLFNamespace, obj.Namespace)
 		case *corev1.Secret:
 			if obj.Name == "static-authentication" {
 				require.Equal(t, staticCred.Data, obj.Data)