Skip to content

Commit

Permalink
Fix health impact values in health status metrics (#3188)
Browse files Browse the repository at this point in the history
Signed-off-by: João Vilaça <[email protected]>
  • Loading branch information
machadovilaca authored Dec 9, 2024
1 parent c9bc8dc commit 872a061
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 27 deletions.
2 changes: 1 addition & 1 deletion hack/prom-rule-ci/prom-rules-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,7 @@ tests:
input_series:
- series: 'kubevirt_hco_system_health_status'
# time: 0 1 2 3 4 5 6 7 8 9 10 11
values: "0 0 0 0 1 1 1 1 2 2 2 2"
values: "1 1 1 1 2 2 2 2 3 3 3 3"
- series: 'ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="warning"}'
# time: 0 1 2 3 4 5 6 7 8 9 10 11
values: "1 1 stale stale 1 1 stale stale 1 1 stale stale"
Expand Down
28 changes: 27 additions & 1 deletion pkg/monitoring/rules/recordingrules/operator.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
package recordingrules

import (
"fmt"

"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
"github.com/machadovilaca/operator-observability/pkg/operatorrules"
"k8s.io/apimachinery/pkg/util/intstr"

"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
)

// Values emitted by the operator health status recording rule. The rule's
// help text describes them as healthy (0), warning (1) and critical (2), so
// they are written out explicitly rather than derived from declaration order.
const (
	// NoImpact is the value the rule falls back to when neither the critical
	// nor the warning clause of the expression matches.
	NoImpact float64 = 0
	// WarningImpact is the value produced by the warning clause of the
	// expression.
	WarningImpact float64 = 1
	// CriticalImpact is the value produced by the critical clause of the
	// expression.
	CriticalImpact float64 = 2
)

var operatorRecordingRules = []operatorrules.RecordingRule{
Expand All @@ -13,7 +23,7 @@ var operatorRecordingRules = []operatorrules.RecordingRule{
Help: "Indicates whether HCO and its secondary resources health status is healthy (0), warning (1) or critical (2), based both on the firing alerts that impact the operator health, and on kubevirt_hco_system_health_status metric",
},
MetricType: operatormetrics.GaugeType,
Expr: intstr.FromString(`label_replace(vector(2) and on() ((kubevirt_hco_system_health_status>1) or (count(ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="critical"})>0)) or (vector(1) and on() ((kubevirt_hco_system_health_status==1) or (count(ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="warning"})>0))) or vector(0),"name","kubevirt-hyperconverged","","")`),
Expr: buildOperatorHealthStatusExpr(),
},
{
MetricsOpts: operatormetrics.MetricOpts{
Expand All @@ -32,3 +42,19 @@ var operatorRecordingRules = []operatorrules.RecordingRule{
Expr: intstr.FromString(`sum by (container, reason)(kubevirt_memory_delta_from_requested_bytes)`),
},
}

// buildOperatorHealthStatusExpr assembles the PromQL expression for the
// operator health status recording rule.
//
// The expression is an `or` chain evaluated in priority order: the critical
// clause, then the warning clause, then a constant NoImpact vector as the
// fallback. The result is wrapped in label_replace so that it carries the
// label name="kubevirt-hyperconverged".
func buildOperatorHealthStatusExpr() intstr.IntOrString {
	// impactClause renders one clause of the chain: it yields `impact` when
	// kubevirt_hco_system_health_status equals `status`, or when at least one
	// firing kubevirt alert has operator_health_impact set to `severity`.
	impactClause := func(impact, status int64, severity string) string {
		return fmt.Sprintf(
			`(vector(%d) and on() ((kubevirt_hco_system_health_status==%d) or (count(ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="%s"})>0)))`,
			impact, status, severity,
		)
	}

	query := fmt.Sprintf(
		`label_replace(%s or %s or vector(%d),"name","kubevirt-hyperconverged","","")`,
		impactClause(int64(CriticalImpact), int64(metrics.SystemHealthStatusError), "critical"),
		impactClause(int64(WarningImpact), int64(metrics.SystemHealthStatusWarning), "warning"),
		int64(NoImpact),
	)

	return intstr.FromString(query)
}
34 changes: 9 additions & 25 deletions tests/func-tests/monitoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ import (
"crypto/tls"
"flag"
"fmt"
"math"
"net/http"
"strconv"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

openshiftroutev1 "github.com/openshift/api/route/v1"
deschedulerv1 "github.com/openshift/cluster-kube-descheduler-operator/pkg/apis/descheduler/v1"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
Expand All @@ -32,18 +32,13 @@ import (
kubevirtcorev1 "kubevirt.io/api/core/v1"

hcoalerts "github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules/alerts"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules/recordingrules"
hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
tests "github.com/kubevirt/hyperconverged-cluster-operator/tests/func-tests"
)

var runbookClient = http.DefaultClient

const (
noneImpact float64 = iota
warningImpact
criticalImpact
)

var _ = Describe("[crit:high][vendor:[email protected]][level:system]Monitoring", Serial, Ordered, Label(tests.OpenshiftLabel, "monitoring"), func() {
flag.Parse()

Expand Down Expand Up @@ -76,7 +71,7 @@ var _ = Describe("[crit:high][vendor:[email protected]][level:system]Monitoring"
Expect(err).NotTo(HaveOccurred())
})

It("Alert rules should have all the requried annotations", func() {
It("Alert rules should have all the required annotations", func() {
for _, group := range prometheusRule.Spec.Groups {
for _, rule := range group.Rules {
if rule.Alert != "" {
Expand Down Expand Up @@ -171,7 +166,7 @@ var _ = Describe("[crit:high][vendor:[email protected]][level:system]Monitoring"
return alert
}).WithTimeout(60 * time.Second).WithPolling(time.Second).WithContext(ctx).ShouldNot(BeNil())

verifyOperatorHealthMetricValue(ctx, promClient, hcoClient, initialOperatorHealthMetricValue, warningImpact)
verifyOperatorHealthMetricValue(ctx, promClient, initialOperatorHealthMetricValue, recordingrules.WarningImpact)
})

It("UnsupportedHCOModification alert should fired when there is an jsonpatch annotation to modify an operand CRs", func(ctx context.Context) {
Expand All @@ -189,7 +184,7 @@ var _ = Describe("[crit:high][vendor:[email protected]][level:system]Monitoring"
alert := getAlertByName(alerts, "UnsupportedHCOModification")
return alert
}).WithTimeout(60 * time.Second).WithPolling(time.Second).WithContext(ctx).ShouldNot(BeNil())
verifyOperatorHealthMetricValue(ctx, promClient, hcoClient, initialOperatorHealthMetricValue, warningImpact)
verifyOperatorHealthMetricValue(ctx, promClient, initialOperatorHealthMetricValue, recordingrules.WarningImpact)
})

Describe("KubeDescheduler", Serial, Ordered, Label(tests.OpenshiftLabel, "monitoring"), func() {
Expand Down Expand Up @@ -293,7 +288,7 @@ var _ = Describe("[crit:high][vendor:[email protected]][level:system]Monitoring"
return alert
}).WithTimeout(60 * time.Second).WithPolling(time.Second).WithContext(ctx).ShouldNot(BeNil())

verifyOperatorHealthMetricValue(ctx, promClient, hcoClient, initialOperatorHealthMetricValue, criticalImpact)
verifyOperatorHealthMetricValue(ctx, promClient, initialOperatorHealthMetricValue, recordingrules.CriticalImpact)

By("Correctly configuring the descheduler for KubeVirt")
Expect(cli.Patch(ctx, descheduler, patchConfigure)).To(Succeed())
Expand Down Expand Up @@ -367,8 +362,7 @@ var _ = Describe("[crit:high][vendor:[email protected]][level:system]Monitoring"
return alert
}).WithTimeout(60 * time.Second).WithPolling(time.Second).WithContext(ctx).ShouldNot(BeNil())

verifyOperatorHealthMetricValue(ctx, promClient, hcoClient, initialOperatorHealthMetricValue, criticalImpact)

verifyOperatorHealthMetricValue(ctx, promClient, initialOperatorHealthMetricValue, recordingrules.CriticalImpact)
})
})

Expand All @@ -383,22 +377,12 @@ func getAlertByName(alerts promApiv1.AlertsResult, alertName string) *promApiv1.
return nil
}

func verifyOperatorHealthMetricValue(ctx context.Context, promClient promApiv1.API, hcoClient *tests.HCOPrometheusClient, initialOperatorHealthMetricValue, alertImpact float64) {
func verifyOperatorHealthMetricValue(ctx context.Context, promClient promApiv1.API, initialOperatorHealthMetricValue, alertImpact float64) {
Eventually(func(g Gomega, ctx context.Context) {
if alertImpact >= initialOperatorHealthMetricValue {
systemHealthMetricValue, err := hcoClient.GetHCOMetric(ctx, "kubevirt_hco_system_health_status")
g.Expect(err).NotTo(HaveOccurred())

operatorHealthMetricValue := getMetricValue(ctx, promClient, "kubevirt_hyperconverged_operator_health_status")

expectedOperatorHealthMetricValue := math.Max(alertImpact, systemHealthMetricValue)

g.Expect(operatorHealthMetricValue).To(Equal(expectedOperatorHealthMetricValue),
"kubevirt_hyperconverged_operator_health_status value is %f, but its expected value is %f, "+
"while kubevirt_hco_system_health_status value is %f.",
operatorHealthMetricValue, expectedOperatorHealthMetricValue, systemHealthMetricValue)
g.Expect(operatorHealthMetricValue).To(Equal(alertImpact))
}

}).WithTimeout(60 * time.Second).WithPolling(5 * time.Second).WithContext(ctx).Should(Succeed())
}

Expand Down

0 comments on commit 872a061

Please sign in to comment.