Skip to content

Commit

Permalink
addon: Add support for logging agent health checking (#42)
Browse files Browse the repository at this point in the history
* Add support for logging agent health checking

* remove comment

* some refactoring

* fix unit test

* fix probe fields

* add clf and otelcol resource probes to health check

* Update internal/addon/addon.go

Co-authored-by: Joao Marcal <[email protected]>

* Update internal/addon/addon.go

Co-authored-by: Joao Marcal <[email protected]>

* Update internal/addon/addon.go

Co-authored-by: Joao Marcal <[email protected]>

* Update internal/addon/addon_test.go

Co-authored-by: Joao Marcal <[email protected]>

* fixes tests and updates probe type to HealthProberTypeWork

* updates health check function

* fix naming of variables

* add tests for logging and move variables to consts

* address reviews

* logging: added SA & clusterrole binding to allow for CLF not nameded
instance

* logging: fix tests

---------

Co-authored-by: Bayan Taani <[email protected]>
Co-authored-by: Joao Marcal <[email protected]>
Co-authored-by: Joao Marcal <[email protected]>
  • Loading branch information
4 people authored Jun 6, 2024
1 parent a40750d commit 5f4b5e2
Show file tree
Hide file tree
Showing 16 changed files with 298 additions and 24 deletions.
99 changes: 99 additions & 0 deletions internal/addon/addon.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,24 @@
package addon

import (
"errors"
"fmt"

otelv1alpha1 "github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1"
loggingv1 "github.com/openshift/cluster-logging-operator/apis/logging/v1"
"open-cluster-management.io/addon-framework/pkg/agent"
"open-cluster-management.io/addon-framework/pkg/utils"
addonapiv1alpha1 "open-cluster-management.io/api/addon/v1alpha1"
workv1 "open-cluster-management.io/api/work/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

var (
errProbeConditionNotSatisfied = errors.New("probe condition is not satisfied")
errProbeValueIsNil = errors.New("probe value is nil")
errValueNotProbed = errors.New("value not probed")
)

func NewRegistrationOption(agentName string) *agent.RegistrationOption {
return &agent.RegistrationOption{
CSRConfigurations: agent.KubeClientSignerConfigurations(Name, agentName),
Expand All @@ -30,3 +42,90 @@ func GetObjectKey(configRef []addonapiv1alpha1.ConfigReference, group, resource
}
return key
}

// AgentHealthProber returns a HealthProber struct that contains the necessary
// information to assert if an addon deployment is ready or not.
func AgentHealthProber() *agent.HealthProber {
return &agent.HealthProber{
Type: agent.HealthProberTypeWork,
WorkProber: &agent.WorkHealthProber{
ProbeFields: []agent.ProbeField{
{
ResourceIdentifier: workv1.ResourceIdentifier{
Group: loggingv1.GroupVersion.Group,
Resource: ClusterLogForwardersResource,
Name: SpokeCLFName,
Namespace: SpokeCLFNamespace,
},
ProbeRules: []workv1.FeedbackRule{
{
Type: workv1.JSONPathsType,
JsonPaths: []workv1.JsonPath{
{
Name: clfProbeKey,
Path: clfProbePath,
},
},
},
},
},
{
ResourceIdentifier: workv1.ResourceIdentifier{
Group: otelv1alpha1.GroupVersion.Group,
Resource: OpenTelemetryCollectorsResource,
Name: SpokeOTELColName,
Namespace: SpokeOTELColNamespace,
},
ProbeRules: []workv1.FeedbackRule{
{
Type: workv1.JSONPathsType,
JsonPaths: []workv1.JsonPath{
{
Name: otelColProbeKey,
Path: otelColProbePath,
},
},
},
},
},
},
HealthCheck: func(identifier workv1.ResourceIdentifier, result workv1.StatusFeedbackResult) error {
for _, value := range result.Values {
switch {
case identifier.Resource == ClusterLogForwardersResource:
if value.Name != clfProbeKey {
continue
}

if value.Value.String == nil {
return fmt.Errorf("%w: clusterlogforwarder with key %s/%s", errProbeValueIsNil, identifier.Namespace, identifier.Name)
}

if *value.Value.String != "True" {
return fmt.Errorf("%w: clusterlogforwarder status condition type is %s for %s/%s", errProbeConditionNotSatisfied, *value.Value.String, identifier.Namespace, identifier.Name)
}

return nil
case identifier.Resource == OpenTelemetryCollectorsResource:
if value.Name != otelColProbeKey {
continue
}

if value.Value.Integer == nil {
return fmt.Errorf("%w: opentelemetrycollector with key %s/%s", errProbeValueIsNil, identifier.Namespace, identifier.Name)
}

if *value.Value.Integer < 1 {
return fmt.Errorf("%w: opentelemetrycollector replicas is %d for %s/%s", errProbeConditionNotSatisfied, *value.Value.Integer, identifier.Namespace, identifier.Name)
}

return nil
default:
continue
}
}
return fmt.Errorf("%w: for resource %s with key %s/%s", errValueNotProbed, identifier.Resource, identifier.Namespace, identifier.Name)
},
},
}
}
100 changes: 100 additions & 0 deletions internal/addon/addon_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package addon

import (
"fmt"
"testing"

"github.com/stretchr/testify/require"
workv1 "open-cluster-management.io/api/work/v1"

otelv1alpha1 "github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1"
loggingv1 "github.com/openshift/cluster-logging-operator/apis/logging/v1"
)

func Test_AgentHealthProber_CLF(t *testing.T) {
unhealthyError := fmt.Errorf("%w: clusterlogforwarder status condition type is %s for %s/%s", errProbeConditionNotSatisfied, "False", SpokeCLFNamespace, SpokeCLFName)
for _, tc := range []struct {
name string
status string
expectedErr string
}{
{
name: "healthy",
status: "True",
},
{
name: "unhealthy",
status: "False",
expectedErr: unhealthyError.Error(),
},
} {
t.Run(tc.name, func(t *testing.T) {
healthProber := AgentHealthProber()
err := healthProber.WorkProber.HealthCheck(workv1.ResourceIdentifier{
Group: loggingv1.GroupVersion.Group,
Resource: ClusterLogForwardersResource,
Name: SpokeCLFName,
Namespace: SpokeCLFNamespace,
}, workv1.StatusFeedbackResult{
Values: []workv1.FeedbackValue{
{
Name: "isReady",
Value: workv1.FieldValue{
Type: workv1.String,
String: &tc.status,
},
},
},
})
if tc.expectedErr != "" {
require.EqualError(t, err, tc.expectedErr)
return
}
require.NoError(t, err)
})
}
}

func Test_AgentHealthProber_OTELCol(t *testing.T) {
unhealthyError := fmt.Errorf("%w: opentelemetrycollector replicas is %d for %s/%s", errProbeConditionNotSatisfied, 0, SpokeOTELColNamespace, SpokeOTELColName)
for _, tc := range []struct {
name string
replicas int64
expectedErr string
}{
{
name: "healthy",
replicas: 1,
},
{
name: "unhealthy",
replicas: 0,
expectedErr: unhealthyError.Error(),
},
} {
t.Run(tc.name, func(t *testing.T) {
healthProber := AgentHealthProber()
err := healthProber.WorkProber.HealthCheck(workv1.ResourceIdentifier{
Group: otelv1alpha1.GroupVersion.Group,
Resource: OpenTelemetryCollectorsResource,
Name: SpokeOTELColName,
Namespace: SpokeOTELColNamespace,
}, workv1.StatusFeedbackResult{
Values: []workv1.FeedbackValue{
{
Name: "replicas",
Value: workv1.FieldValue{
Type: workv1.Integer,
Integer: &tc.replicas,
},
},
},
})
if tc.expectedErr != "" {
require.EqualError(t, err, tc.expectedErr)
return
}
require.NoError(t, err)
})
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
apiVersion: logging.openshift.io/v1
kind: ClusterLogForwarder
metadata:
name: instance
name: mcoa-instance
namespace: openshift-logging
labels:
app: {{ template "logginghelm.name" . }}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{{- if .Values.enabled }}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: openshift-logging:mcoa-logcollector:application-logs
labels:
app: {{ template "logginghelm.name" . }}
chart: {{ template "logginghelm.chart" . }}
release: {{ .Release.Name }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: collect-application-logs
subjects:
- kind: ServiceAccount
name: mcoa-logcollector
namespace: openshift-logging
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: openshift-logging:mcoa-logcollector:audit-logs
labels:
app: {{ template "logginghelm.name" . }}
chart: {{ template "logginghelm.chart" . }}
release: {{ .Release.Name }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: collect-audit-logs
subjects:
- kind: ServiceAccount
name: mcoa-logcollector
namespace: openshift-logging
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: openshift-logging:mcoa-logcollector:infrastructure-logs
labels:
app: {{ template "logginghelm.name" . }}
chart: {{ template "logginghelm.chart" . }}
release: {{ .Release.Name }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: collect-infrastructure-logs
subjects:
- kind: ServiceAccount
name: mcoa-logcollector
namespace: openshift-logging
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{{- if .Values.enabled }}
apiVersion: v1
kind: ServiceAccount
metadata:
name: mcoa-logcollector
namespace: openshift-logging
labels:
app: {{ template "logginghelm.name" . }}
chart: {{ template "logginghelm.chart" . }}
release: {{ .Release.Name }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ metadata:
apiVersion: v1
kind: Namespace
metadata:
name: spoke-otelcol
name: mcoa-opentelemetry
labels:
app: {{ template "tracinghelm.name" . }}
chart: {{ template "tracinghelm.chart" . }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
name: spoke-otelcol
namespace: spoke-otelcol
name: mcoa-instance
namespace: mcoa-opentelemetry
labels:
app: {{ template "tracinghelm.name" . }}
chart: {{ template "tracinghelm.chart" . }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ apiVersion: v1
kind: Secret
metadata:
name: {{ $secret_config.name }}
namespace: spoke-otelcol
namespace: mcoa-opentelemetry
labels:
app: {{ template "tracinghelm.name" $ }}
chart: {{ template "tracinghelm.chart" $ }}
Expand Down
12 changes: 12 additions & 0 deletions internal/addon/var.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,18 @@ const (

AdcLoggingDisabledKey = "loggingDisabled"
AdcTracingisabledKey = "tracingDisabled"

ClusterLogForwardersResource = "clusterlogforwarders"
SpokeCLFName = "mcoa-instance"
SpokeCLFNamespace = "openshift-logging"
clfProbeKey = "isReady"
clfProbePath = ".status.conditions[?(@.type==\"Ready\")].status"

OpenTelemetryCollectorsResource = "opentelemetrycollectors"
SpokeOTELColName = "mcoa-instance"
SpokeOTELColNamespace = "mcoa-opentelemetry"
otelColProbeKey = "replicas"
otelColProbePath = ".spec.replicas"
)

//go:embed manifests
Expand Down
6 changes: 1 addition & 5 deletions internal/logging/handlers/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,12 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
clusterLogForwarderResource = "clusterlogforwarders"
)

func BuildOptions(ctx context.Context, k8s client.Client, mcAddon *addonapiv1alpha1.ManagedClusterAddOn, adoc *addonapiv1alpha1.AddOnDeploymentConfig) (manifests.Options, error) {
resources := manifests.Options{
AddOnDeploymentConfig: adoc,
}

key := addon.GetObjectKey(mcAddon.Status.ConfigReferences, loggingv1.GroupVersion.Group, clusterLogForwarderResource)
key := addon.GetObjectKey(mcAddon.Status.ConfigReferences, loggingv1.GroupVersion.Group, addon.ClusterLogForwardersResource)
clf := &loggingv1.ClusterLogForwarder{}
if err := k8s.Get(ctx, key, clf, &client.GetOptions{}); err != nil {
return resources, err
Expand Down
10 changes: 7 additions & 3 deletions internal/logging/helm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,15 @@ func Test_Logging_AllConfigsTogether_AllResources(t *testing.T) {
},
ConfigReferent: addonapiv1alpha1.ConfigReferent{
Namespace: "open-cluster-management",
Name: "instance",
Name: "mcoa-instance",
},
},
}

// Setup configuration resources: ClusterLogForwarder, AddOnDeploymentConfig, Secrets, ConfigMaps
clf = &loggingv1.ClusterLogForwarder{
ObjectMeta: metav1.ObjectMeta{
Name: "instance",
Name: "mcoa-instance",
Namespace: "open-cluster-management",
Annotations: map[string]string{
"authentication.mcoa.openshift.io/app-logs": "SecretReference",
Expand Down Expand Up @@ -215,7 +215,7 @@ func Test_Logging_AllConfigsTogether_AllResources(t *testing.T) {
// Render manifests and return them as k8s runtime objects
objects, err := loggingAgentAddon.Manifests(managedCluster, managedClusterAddOn)
require.NoError(t, err)
require.Equal(t, 7, len(objects))
require.Equal(t, 11, len(objects))

for _, obj := range objects {
switch obj := obj.(type) {
Expand All @@ -226,6 +226,10 @@ func Test_Logging_AllConfigsTogether_AllResources(t *testing.T) {
require.NotNil(t, obj.Spec.Outputs[1].Secret)
require.Equal(t, "static-authentication", obj.Spec.Outputs[0].Secret.Name)
require.Equal(t, "static-authentication", obj.Spec.Outputs[1].Secret.Name)
// Check name and namespace to make sure that if we change the helm
// manifests that we don't break the addon probes
require.Equal(t, addon.SpokeCLFName, obj.Name)
require.Equal(t, addon.SpokeCLFNamespace, obj.Namespace)
case *corev1.Secret:
if obj.Name == "static-authentication" {
require.Equal(t, staticCred.Data, obj.Data)
Expand Down
Loading

0 comments on commit 5f4b5e2

Please sign in to comment.