Skip to content

Commit

Permalink
Fix OperatorConditionsUnhealthy alert expression metric value (#3181)
Browse files: browse the repository at this point in the history
Signed-off-by: João Vilaça <[email protected]>
  • Loading branch information
machadovilaca authored Nov 28, 2024
1 parent dcffe2f commit 9677555
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 15 deletions.
15 changes: 7 additions & 8 deletions hack/prom-rule-ci/prom-rules-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -774,22 +774,21 @@ tests:
# Test OperatorConditionsUnhealthy
- interval: 1m
input_series:
- series: 'kubevirt_hco_system_health_status{reason="healthy"}'
- values: "stale 1 stale stale stale"

- series: 'kubevirt_hco_system_health_status{reason="SOME_ERROR"}'
values: "stale stale 2 stale"
values: "stale stale stale 3 stale"

- series: 'kubevirt_hco_system_health_status{reason="SOME_WARNING"}'
values: "stale stale stale stale 1 stale"
values: "stale stale stale stale stale 2 stale"

alert_rule_test:
- eval_time: 1m
alertname: OperatorConditionsUnhealthy
exp_alerts: [ ]

- eval_time: 1m
alertname: OperatorConditionsUnhealthy
exp_alerts: [ ]

- eval_time: 2m
- eval_time: 3m
alertname: OperatorConditionsUnhealthy
exp_alerts:
- exp_annotations:
Expand All @@ -803,7 +802,7 @@ tests:
kubernetes_operator_component: "hyperconverged-cluster-operator"
reason: "SOME_ERROR"

- eval_time: 4m
- eval_time: 5m
alertname: OperatorConditionsUnhealthy
exp_alerts:
- exp_annotations:
Expand Down
9 changes: 4 additions & 5 deletions pkg/monitoring/metrics/operator_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ const (
)

const (
SystemHealthStatusUnknown float64 = 0
SystemHealthStatusHealthy float64 = iota
SystemHealthStatusUnknown float64 = iota
SystemHealthStatusHealthy
SystemHealthStatusWarning
SystemHealthStatusError
)
Expand Down Expand Up @@ -138,12 +138,11 @@ func SetHCOSystemError(reason string) {
func GetHCOMetricSystemHealthStatus(reason string) (float64, error) {
dto := &ioprometheusclient.Metric{}
err := systemHealthStatus.WithLabelValues(reason).Write(dto)
value := dto.Gauge.GetValue()

if err != nil {
return SystemHealthStatusUnknown, err
}
return value, nil

return dto.Gauge.GetValue(), nil
}

func getLabelsForObj(kind string, name string) string {
Expand Down
8 changes: 6 additions & 2 deletions pkg/monitoring/rules/alerts/health_alerts.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
package alerts

import (
"fmt"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"

"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
)

func healthAlerts() []promv1.Rule {
return []promv1.Rule{
{
Alert: "OperatorConditionsUnhealthy",
Expr: intstr.FromString("kubevirt_hco_system_health_status == 2"),
Expr: intstr.FromString(fmt.Sprintf("kubevirt_hco_system_health_status == %f", metrics.SystemHealthStatusError)),
Annotations: map[string]string{
"description": "HCO and its secondary resources are in a critical state due to {{ $labels.reason }}.",
"summary": "HCO and its secondary resources are in a critical state.",
Expand All @@ -21,7 +25,7 @@ func healthAlerts() []promv1.Rule {
},
{
Alert: "OperatorConditionsUnhealthy",
Expr: intstr.FromString("kubevirt_hco_system_health_status == 1"),
Expr: intstr.FromString(fmt.Sprintf("kubevirt_hco_system_health_status == %f", metrics.SystemHealthStatusWarning)),
Annotations: map[string]string{
"description": "HCO and its secondary resources are in a warning state due to {{ $labels.reason }}.",
"summary": "HCO and its secondary resources are in a warning state.",
Expand Down

0 comments on commit 9677555

Please sign in to comment.