Skip to content

Commit

Permalink
Add observability controller alerts (#3272)
Browse files Browse the repository at this point in the history
* Refactor hyperconverged monitoring into separate package

Signed-off-by: João Vilaça <[email protected]>

* Add base for observability controller Prometheus rules

Signed-off-by: João Vilaça <[email protected]>

* Add HighCPUWorkload alert

Signed-off-by: João Vilaça <[email protected]>

* Add HAControlPlaneDown alert

Signed-off-by: João Vilaça <[email protected]>

* Add NodeNetworkInterfaceDown alert

Signed-off-by: João Vilaça <[email protected]>

* Update observability func tests

Signed-off-by: João Vilaça <[email protected]>

* Add observability controller unit tests

Signed-off-by: João Vilaça <[email protected]>

---------

Signed-off-by: João Vilaça <[email protected]>
  • Loading branch information
machadovilaca authored Jan 19, 2025
1 parent c4ab624 commit fbf6052
Show file tree
Hide file tree
Showing 43 changed files with 673 additions and 83 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ current-dir := $(realpath .)
prom-rules-verify: build-prom-spec-dumper
./hack/prom-rule-ci/verify-rules.sh \
"${current-dir}/_out/rule-spec-dumper" \
"${current-dir}/hack/prom-rule-ci/prom-rules-tests.yaml"
"${current-dir}/hack/prom-rule-ci"

install:
go install ./cmd/...
Expand Down
2 changes: 1 addition & 1 deletion api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion api/v1beta1/zz_generated.defaults.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion api/v1beta1/zz_generated.openapi.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions cmd/hyperconverged-cluster-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ import (
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/nodes"
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/observability"
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/operands"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/hyperconverged/metrics"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/upgradepatch"
hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
)
Expand Down Expand Up @@ -195,7 +195,7 @@ func main() {
}

if ci.IsOpenshift() {
if err = observability.SetupWithManager(mgr); err != nil {
if err = observability.SetupWithManager(mgr, ci.GetDeployment()); err != nil {
logger.Error(err, "unable to create controller", "controller", "Observability")
os.Exit(1)
}
Expand Down
2 changes: 1 addition & 1 deletion controllers/alerts/alerts.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/hyperconverged/rules"
hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
)

Expand Down
4 changes: 2 additions & 2 deletions controllers/alerts/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ import (

"github.com/kubevirt/hyperconverged-cluster-operator/controllers/common"
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/commontestutils"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/hyperconverged/metrics"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/hyperconverged/rules"
hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
)

Expand Down
2 changes: 1 addition & 1 deletion controllers/alerts/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import (
logf "sigs.k8s.io/controller-runtime/pkg/log"

"github.com/kubevirt/hyperconverged-cluster-operator/controllers/common"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/hyperconverged/metrics"
hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
)

Expand Down
2 changes: 1 addition & 1 deletion controllers/descheduler/descheduler_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/source"

"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/hyperconverged/metrics"
hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
)

Expand Down
2 changes: 1 addition & 1 deletion controllers/hyperconverged/hyperconverged_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ import (
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/alerts"
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/common"
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/operands"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/hyperconverged/metrics"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/upgradepatch"
hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
"github.com/kubevirt/hyperconverged-cluster-operator/version"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ import (
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/common"
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/commontestutils"
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/operands"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/hyperconverged/metrics"
hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
"github.com/kubevirt/hyperconverged-cluster-operator/version"
)
Expand Down
49 changes: 49 additions & 0 deletions controllers/observability/alerts.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package observability

import (
"context"
"fmt"
"reflect"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"

"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/observability/rules"
)

// ReconcileAlerts makes the PrometheusRule stored in the cluster match the one
// produced by rules.BuildPrometheusRule for the reconciler's namespace and owner.
//
// If the rule does not exist it is created. If it exists and its Spec differs
// from the desired Spec, the stored object's Spec is overwritten and the object
// is updated. Only the Spec is compared and replaced; other fields of an
// existing object are left untouched.
//
// Every returned error wraps its cause with %w, so callers can still inspect it
// with errors.Is / errors.As or the apierrors.Is* helpers.
func (r *Reconciler) ReconcileAlerts(ctx context.Context) error {
	desiredPromRule, err := rules.BuildPrometheusRule(r.namespace, r.owner)
	if err != nil {
		return fmt.Errorf("failed to build PrometheusRule: %w", err)
	}

	existingPromRule := &promv1.PrometheusRule{}
	err = r.Get(ctx, types.NamespacedName{
		Name:      desiredPromRule.Name,
		Namespace: desiredPromRule.Namespace,
	}, existingPromRule)

	if err != nil {
		if apierrors.IsNotFound(err) {
			// The rule is missing: create it from the desired state.
			if createErr := r.Create(ctx, desiredPromRule); createErr != nil {
				return fmt.Errorf("failed to create PrometheusRule: %w", createErr)
			}

			return nil
		}

		// Any other lookup failure is reported as-is; nothing is created or updated.
		return fmt.Errorf("failed to get PrometheusRule: %w", err)
	}

	// The rule exists: only write when the specs differ, to avoid no-op updates.
	if !reflect.DeepEqual(existingPromRule.Spec, desiredPromRule.Spec) {
		existingPromRule.Spec = desiredPromRule.Spec
		if updateErr := r.Update(ctx, existingPromRule); updateErr != nil {
			return fmt.Errorf("failed to update PrometheusRule: %w", updateErr)
		}
	}

	return nil
}
72 changes: 72 additions & 0 deletions controllers/observability/alerts_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package observability_test

import (
"context"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
appsv1 "k8s.io/api/apps/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"

"github.com/kubevirt/hyperconverged-cluster-operator/controllers/commontestutils"
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/observability"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/observability/rules"
)

// namespace is the namespace handed to both the rule builder and the
// reconciler under test.
const namespace = "observability_test"

// logger is the logger passed to the mocked manager used to build the reconciler.
var logger = logf.Log.WithName("observability-controller")

// Specs for Reconciler.ReconcileAlerts: the PrometheusRule must be created
// when absent and brought back to the desired spec when it has drifted.
var _ = Describe("Reconcile Alerts", func() {
	var (
		reconciler *observability.Reconciler
		cl         client.Client
		promRules  *promv1.PrometheusRule
	)

	// expectStoredSpecMatchesDesired reads the rule back through the fake
	// client and checks that its spec equals the freshly built one.
	expectStoredSpecMatchesDesired := func(ctx context.Context) {
		stored := promv1.PrometheusRule{}
		Expect(cl.Get(ctx, client.ObjectKeyFromObject(promRules), &stored)).To(Succeed())
		Expect(stored.Spec).To(Equal(promRules.Spec))
	}

	BeforeEach(func() {
		Expect(rules.SetupRules()).To(Succeed())

		var buildErr error
		promRules, buildErr = rules.BuildPrometheusRule(namespace, &metav1.OwnerReference{})
		Expect(buildErr).ToNot(HaveOccurred())

		// Start every spec from an empty fake cluster.
		cl = commontestutils.InitClient([]client.Object{})
		mgr, mgrErr := commontestutils.NewManagerMock(&rest.Config{}, manager.Options{}, cl, logger)
		Expect(mgrErr).ToNot(HaveOccurred())

		reconciler = observability.NewReconciler(mgr, namespace, &appsv1.Deployment{})
	})

	It("Should create new PrometheusRules", func() {
		ctx := context.TODO()

		Expect(reconciler.ReconcileAlerts(ctx)).To(Succeed())

		expectStoredSpecMatchesDesired(ctx)
	})

	It("Should update PrometheusRules with diffs", func() {
		ctx := context.TODO()

		// Seed the cluster with a rule whose first expression was tampered with.
		drifted := promRules.DeepCopy()
		drifted.Spec.Groups[0].Rules[0].Expr = intstr.FromString("1")
		Expect(cl.Create(ctx, drifted)).To(Succeed())

		Expect(reconciler.ReconcileAlerts(ctx)).To(Succeed())

		expectStoredSpecMatchesDesired(ctx)
	})
})
66 changes: 58 additions & 8 deletions controllers/observability/observability_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,23 @@ package observability

import (
"context"
"fmt"
"time"

appsv1 "k8s.io/api/apps/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/rest"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/source"

"github.com/kubevirt/hyperconverged-cluster-operator/pkg/alertmanager"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/observability/rules"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
)

var (
Expand All @@ -21,33 +27,62 @@ var (
)

type Reconciler struct {
config *rest.Config
events chan event.GenericEvent
client.Client

namespace string
config *rest.Config
events chan event.GenericEvent
owner *metav1.OwnerReference

amApi *alertmanager.Api
}

func (r *Reconciler) Reconcile(_ context.Context, _ ctrl.Request) (ctrl.Result, error) {
func (r *Reconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) {
log.Info("Reconciling Observability")

var errors []error

if err := r.ensurePodDisruptionBudgetAtLimitIsSilenced(); err != nil {
errors = append(errors, err)
}

if err := r.ReconcileAlerts(ctx); err != nil {
errors = append(errors, err)
}

if len(errors) > 0 {
err := fmt.Errorf("reconciliation failed: %v", errors)
log.Error(err, "Reconciliation failed")
return ctrl.Result{}, err
}

return ctrl.Result{}, nil
}

func NewReconciler(config *rest.Config) *Reconciler {
func NewReconciler(mgr ctrl.Manager, namespace string, ownerDeployment *appsv1.Deployment) *Reconciler {
return &Reconciler{
config: config,
events: make(chan event.GenericEvent, 1),
Client: mgr.GetClient(),
namespace: namespace,
config: mgr.GetConfig(),
events: make(chan event.GenericEvent, 1),
owner: buildOwnerReference(ownerDeployment),
}
}

func SetupWithManager(mgr ctrl.Manager) error {
func SetupWithManager(mgr ctrl.Manager, ownerDeployment *appsv1.Deployment) error {
log.Info("Setting up controller")

r := NewReconciler(mgr.GetConfig())
namespace, err := util.GetOperatorNamespaceFromEnv()
if err != nil {
return fmt.Errorf("failed to get operator namespace: %v", err)
}

err = rules.SetupRules()
if err != nil {
return fmt.Errorf("failed to setup Prometheus rules: %v", err)
}

r := NewReconciler(mgr, namespace, ownerDeployment)
r.startEventLoop()

return ctrl.NewControllerManagedBy(mgr).
Expand All @@ -74,3 +109,18 @@ func (r *Reconciler) startEventLoop() {
}
}()
}

// buildOwnerReference converts the given Deployment into an OwnerReference
// carrying its name and UID. The reference is marked as neither a controller
// reference nor one that blocks owner deletion. A nil Deployment yields nil.
func buildOwnerReference(ownerDeployment *appsv1.Deployment) *metav1.OwnerReference {
	if ownerDeployment == nil {
		return nil
	}

	ref := metav1.OwnerReference{
		APIVersion: appsv1.SchemeGroupVersion.String(),
		Kind:       "Deployment",
		Name:       ownerDeployment.GetName(),
		UID:        ownerDeployment.GetUID(),
	}
	ref.BlockOwnerDeletion = ptr.To(false)
	ref.Controller = ptr.To(false)

	return &ref
}
Loading

0 comments on commit fbf6052

Please sign in to comment.