From 4dd657bc2baac4466a2dcb4f186e3f8e3cf698ea Mon Sep 17 00:00:00 2001
From: aaa5kameric
Date: Wed, 28 Aug 2024 12:07:55 +0200
Subject: [PATCH] minor changes for consistency

---
Notes (ignored by `git am`; not part of the commit message):

  * Adds two alerts on the remaining Central instance capacity per cluster,
    computed as capacity_max - capacity_used:
      - CentralInstanceLimitCriticalCapacity: remaining <= 2 for 5m (critical)
      - CentralInstanceLimitWarningCapacity:  remaining <= 10 for 5m (warning)
    The warning expression has no lower bound, so a cluster at the critical
    threshold satisfies both expressions.
  * Adds a promtool unit test covering one cluster per severity.
  * The test input series use expanding notation ("<value>+0x40") so that a
    sample exists at every evaluation step up to eval_time. A single sample
    at t=0 sits exactly on the lookback boundary when evaluated at 5m, which
    makes the result depend on the Prometheus version in use.
  * The blob id on the `index` line of the new file is the one from the
    original commit.

 resources/prometheus/prometheus-rules.yaml    | 20 ++++++++
 .../unit_tests/CentralInstanceLimit.yaml      | 48 +++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 resources/prometheus/unit_tests/CentralInstanceLimit.yaml

diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml
index 6521231..6b393c1 100644
--- a/resources/prometheus/prometheus-rules.yaml
+++ b/resources/prometheus/prometheus-rules.yaml
@@ -136,6 +136,16 @@ spec:
           annotations:
             summary: "Certificate expiring very soon: `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`."
             description: "Certificate `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on {{ humanizeTimestamp $value}}."
+        - alert: CentralInstanceLimitCriticalCapacity
+          expr: |
+            acs_fleet_manager_cluster_status_capacity_max-acs_fleet_manager_cluster_status_capacity_used<=2
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Cluster ID: '{{ $labels.clusterID }}' is at a critical instance limit."
+            description: "Cluster '{{ $labels.clusterID }}' has '{{ $value }}' instances left before reaching limit"
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-031-central-instance-limit-reached.md"
         - alert: RHACSFleetshardCertificateExpiryWarning
           expr: |
             acs_fleetshard_certificate_expiration_timestamp <= 7* 24 * 60 * 60 + time()
@@ -144,6 +154,16 @@ spec:
           annotations:
             summary: "Certificate expiring soon: `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`."
             description: "Certificate `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on {{ humanizeTimestamp $value}}."
+        - alert: CentralInstanceLimitWarningCapacity
+          expr: |
+            acs_fleet_manager_cluster_status_capacity_max-acs_fleet_manager_cluster_status_capacity_used<=10
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Cluster ID: '{{ $labels.clusterID }}' is reaching its instance limit."
+            description: "Cluster '{{ $labels.clusterID }}' has '{{ $value }}' instances left before reaching limit"
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-031-central-instance-limit-reached.md"
         - alert: RHACSFleetshardSyncReconciliationErrors
           expr: |
             acs_fleetshard_central_errors_per_reconciliations:ratio_rate10m > 0.10
diff --git a/resources/prometheus/unit_tests/CentralInstanceLimit.yaml b/resources/prometheus/unit_tests/CentralInstanceLimit.yaml
new file mode 100644
index 0000000..4379ac9
--- /dev/null
+++ b/resources/prometheus/unit_tests/CentralInstanceLimit.yaml
@@ -0,0 +1,48 @@
+rule_files:
+  - /tmp/prometheus-rules-test.yaml
+
+
+evaluation_interval: 1m
+
+
+tests:
+  - interval: 15s
+    input_series:
+      - series: acs_fleet_manager_cluster_status_capacity_max{clusterID="cluster1"}
+        values: "10+0x40"  # critical: 10 max - 8 used = 2 left
+      - series: acs_fleet_manager_cluster_status_capacity_used{clusterID="cluster1"}
+        values: "8+0x40"  # critical: 10 max - 8 used = 2 left
+      - series: acs_fleet_manager_cluster_status_capacity_max{clusterID="cluster-2"}
+        values: "15+0x40"  # warning only: 15 max - 5 used = 10 left
+      - series: acs_fleet_manager_cluster_status_capacity_used{clusterID="cluster-2"}
+        values: "5+0x40"  # warning only: 15 max - 5 used = 10 left
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: CentralInstanceLimitCriticalCapacity
+        exp_alerts:
+          - exp_labels:
+              alertname: CentralInstanceLimitCriticalCapacity
+              clusterID: cluster1
+              severity: critical
+            exp_annotations:
+              summary: "Cluster ID: 'cluster1' is at a critical instance limit."
+              description: "Cluster 'cluster1' has '2' instances left before reaching limit"
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-031-central-instance-limit-reached.md"
+  - interval: 15s
+    input_series:
+      - series: acs_fleet_manager_cluster_status_capacity_max{clusterID="cluster-2"}
+        values: "15+0x40"  # warning: 15 max - 5 used = 10 left
+      - series: acs_fleet_manager_cluster_status_capacity_used{clusterID="cluster-2"}
+        values: "5+0x40"  # warning: 15 max - 5 used = 10 left
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: CentralInstanceLimitWarningCapacity
+        exp_alerts:
+          - exp_labels:
+              alertname: CentralInstanceLimitWarningCapacity
+              clusterID: cluster-2
+              severity: warning
+            exp_annotations:
+              summary: "Cluster ID: 'cluster-2' is reaching its instance limit."
+              description: "Cluster 'cluster-2' has '10' instances left before reaching limit"
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-031-central-instance-limit-reached.md"