diff --git a/hack/prom-rule-ci/prom-rules-tests.yaml b/hack/prom-rule-ci/prom-rules-tests.yaml index 7f306c9e0..a4f3baf8b 100755 --- a/hack/prom-rule-ci/prom-rules-tests.yaml +++ b/hack/prom-rule-ci/prom-rules-tests.yaml @@ -774,22 +774,21 @@ tests: # Test OperatorConditionsUnhealthy - interval: 1m input_series: + - series: 'kubevirt_hco_system_health_status{reason="healthy"}' + values: "stale 1 stale stale stale" + - series: 'kubevirt_hco_system_health_status{reason="SOME_ERROR"}' - values: "stale stale 2 stale" + values: "stale stale stale 3 stale" - series: 'kubevirt_hco_system_health_status{reason="SOME_WARNING"}' - values: "stale stale stale stale 1 stale" + values: "stale stale stale stale stale 2 stale" alert_rule_test: - eval_time: 1m alertname: OperatorConditionsUnhealthy exp_alerts: [ ] - - eval_time: 1m - alertname: OperatorConditionsUnhealthy - exp_alerts: [ ] - - - eval_time: 2m + - eval_time: 3m alertname: OperatorConditionsUnhealthy exp_alerts: - exp_annotations: @@ -803,7 +802,7 @@ tests: kubernetes_operator_component: "hyperconverged-cluster-operator" reason: "SOME_ERROR" - - eval_time: 4m + - eval_time: 5m alertname: OperatorConditionsUnhealthy exp_alerts: - exp_annotations: diff --git a/pkg/monitoring/metrics/operator_metrics.go b/pkg/monitoring/metrics/operator_metrics.go index ad3e68821..2190659ef 100644 --- a/pkg/monitoring/metrics/operator_metrics.go +++ b/pkg/monitoring/metrics/operator_metrics.go @@ -16,8 +16,8 @@ const ( ) const ( - SystemHealthStatusUnknown float64 = 0 - SystemHealthStatusHealthy float64 = iota + SystemHealthStatusUnknown float64 = iota + SystemHealthStatusHealthy SystemHealthStatusWarning SystemHealthStatusError ) @@ -138,12 +138,11 @@ func SetHCOSystemError(reason string) { func GetHCOMetricSystemHealthStatus(reason string) (float64, error) { dto := &ioprometheusclient.Metric{} err := systemHealthStatus.WithLabelValues(reason).Write(dto) - value := dto.Gauge.GetValue() - if err != nil { return 
SystemHealthStatusUnknown, err } - return value, nil + + return dto.Gauge.GetValue(), nil } func getLabelsForObj(kind string, name string) string { diff --git a/pkg/monitoring/rules/alerts/health_alerts.go b/pkg/monitoring/rules/alerts/health_alerts.go index dcc3d2771..8ab62dfd6 100644 --- a/pkg/monitoring/rules/alerts/health_alerts.go +++ b/pkg/monitoring/rules/alerts/health_alerts.go @@ -1,15 +1,19 @@ package alerts import ( + "fmt" + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "k8s.io/apimachinery/pkg/util/intstr" + + "github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics" ) func healthAlerts() []promv1.Rule { return []promv1.Rule{ { Alert: "OperatorConditionsUnhealthy", - Expr: intstr.FromString("kubevirt_hco_system_health_status == 2"), + Expr: intstr.FromString(fmt.Sprintf("kubevirt_hco_system_health_status == %f", metrics.SystemHealthStatusError)), Annotations: map[string]string{ "description": "HCO and its secondary resources are in a critical state due to {{ $labels.reason }}.", "summary": "HCO and its secondary resources are in a critical state.", @@ -21,7 +25,7 @@ func healthAlerts() []promv1.Rule { }, { Alert: "OperatorConditionsUnhealthy", - Expr: intstr.FromString("kubevirt_hco_system_health_status == 1"), + Expr: intstr.FromString(fmt.Sprintf("kubevirt_hco_system_health_status == %f", metrics.SystemHealthStatusWarning)), Annotations: map[string]string{ "description": "HCO and its secondary resources are in a warning state due to {{ $labels.reason }}.", "summary": "HCO and its secondary resources are in a warning state.",