From 82e272c2169624cf599448c4127b6edfdbaff949 Mon Sep 17 00:00:00 2001 From: Yury Kovalev Date: Tue, 9 Apr 2024 14:14:55 +0200 Subject: [PATCH] ROX-16615: Remove the probe service from the Data Plane observability resources --- ...cluster-resource-adjustment-configmap.yaml | 20 ++++----- ...cluster-resource-adjustment-dashboard.yaml | 20 ++++----- .../rhacs-cluster-resource-adjustment.json | 20 ++++----- resources/index.json | 1 - .../pod_monitors/rhacs-probe-metrics.yaml | 36 ---------------- resources/prometheus/prometheus-rules.yaml | 42 ------------------- .../unit_tests/RHACSProbeContainerDown.yaml | 27 ------------ ...ACSProbeContainerFrequentlyRestarting.yaml | 27 ------------ .../unit_tests/RHACSProbeRunFailed.yaml | 31 -------------- .../unit_tests/RHACSProbeScrapeFailed.yaml | 30 ------------- 10 files changed, 30 insertions(+), 224 deletions(-) delete mode 100644 resources/prometheus/pod_monitors/rhacs-probe-metrics.yaml delete mode 100644 resources/prometheus/unit_tests/RHACSProbeContainerDown.yaml delete mode 100644 resources/prometheus/unit_tests/RHACSProbeContainerFrequentlyRestarting.yaml delete mode 100644 resources/prometheus/unit_tests/RHACSProbeRunFailed.yaml delete mode 100644 resources/prometheus/unit_tests/RHACSProbeScrapeFailed.yaml diff --git a/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-configmap.yaml b/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-configmap.yaml index 9be79975..76e8aa92 100644 --- a/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-configmap.yaml +++ b/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-configmap.yaml @@ -4381,7 +4381,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "100%ile", "range": true, @@ -4394,7 +4394,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(.95, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(.95, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "95%ile", @@ -4408,7 +4408,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.50, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.50, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "50%ile", @@ -4422,7 +4422,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "10%ile", @@ -4524,7 +4524,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "100%ile", "range": true, @@ -4537,7 +4537,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.95, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.95, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "95%ile", @@ -4551,7 +4551,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.5, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.5, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "50%ile", @@ -4565,7 +4565,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "10%ile", @@ -4680,7 +4680,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "topk(5, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "topk(5, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "__auto", "range": true, @@ -4781,7 +4781,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "topk(5, sum by(namespace) (avg_over_time(container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}[6h])))", + "expr": "topk(5, sum by(namespace) (avg_over_time(container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}[6h])))", "format": "time_series", "instant": true, "legendFormat": "__auto", diff --git a/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-dashboard.yaml b/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-dashboard.yaml index e25f4384..dae40e59 100644 --- a/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-dashboard.yaml +++ b/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-dashboard.yaml @@ -4381,7 +4381,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "100%ile", "range": true, @@ -4394,7 +4394,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(.95, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(.95, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "95%ile", @@ -4408,7 +4408,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.50, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.50, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "50%ile", @@ -4422,7 +4422,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "10%ile", @@ -4524,7 +4524,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "100%ile", "range": true, @@ -4537,7 +4537,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.95, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.95, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "95%ile", @@ -4551,7 +4551,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.5, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.5, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "50%ile", @@ -4565,7 +4565,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "10%ile", @@ -4680,7 +4680,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "topk(5, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "topk(5, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "__auto", "range": true, @@ -4781,7 +4781,7 @@ spec: }, "editorMode": "code", "exemplar": false, - "expr": "topk(5, sum by(namespace) (avg_over_time(container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}[6h])))", + "expr": "topk(5, sum by(namespace) (avg_over_time(container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}[6h])))", "format": "time_series", "instant": true, "legendFormat": "__auto", diff --git a/resources/grafana/sources/rhacs-cluster-resource-adjustment.json b/resources/grafana/sources/rhacs-cluster-resource-adjustment.json index 407c98b6..a7ea08ca 100644 --- a/resources/grafana/sources/rhacs-cluster-resource-adjustment.json +++ b/resources/grafana/sources/rhacs-cluster-resource-adjustment.json @@ -4370,7 +4370,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "100%ile", "range": true, @@ -4383,7 +4383,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(.95, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(.95, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "95%ile", @@ -4397,7 +4397,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.50, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.50, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "50%ile", @@ -4411,7 +4411,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "10%ile", @@ -4513,7 +4513,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "100%ile", "range": true, @@ -4526,7 +4526,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.95, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.95, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "95%ile", @@ -4540,7 +4540,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.5, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.5, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "50%ile", @@ -4554,7 +4554,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "10%ile", @@ -4669,7 +4669,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "topk(5, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "topk(5, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "__auto", "range": true, @@ -4770,7 +4770,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "topk(5, sum by(namespace) (avg_over_time(container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}[6h])))", + "expr": "topk(5, sum by(namespace) (avg_over_time(container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}[6h])))", "format": "time_series", "instant": true, "legendFormat": "__auto", diff --git a/resources/index.json b/resources/index.json index b843e500..200262fc 100644 --- a/resources/index.json +++ b/resources/index.json @@ -7,7 +7,6 @@ "prometheus/pod_monitors/rhacs-central-metrics.yaml", "prometheus/pod_monitors/rhacs-cloudwatch-exporter.yaml", "prometheus/pod_monitors/rhacs-fleetshard-sync-metrics.yaml", - "prometheus/pod_monitors/rhacs-probe-metrics.yaml", "prometheus/pod_monitors/rhacs-scanner-metrics.yaml" ], "rules": [ diff --git a/resources/prometheus/pod_monitors/rhacs-probe-metrics.yaml b/resources/prometheus/pod_monitors/rhacs-probe-metrics.yaml deleted file mode 100644 index 1b9fbc6e..00000000 --- a/resources/prometheus/pod_monitors/rhacs-probe-metrics.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: rhacs-probe-metrics - labels: - app: rhacs -spec: - selector: - matchLabels: - app: "probe" - namespaceSelector: - any: true - podMetricsEndpoints: - - path: /metrics - port: monitoring - relabelings: - - action: labeldrop - regex: endpoint - - - sourceLabels: [container] - action: replace - targetLabel: job - - - action: labelmap - regex: __meta_kubernetes_pod_annotation_rhacs_redhat_com_(.+) - replacement: rhacs_${1} - - - action: labelmap - regex: __meta_kubernetes_pod_label_rhacs_redhat_com_(.+) - replacement: rhacs_${1} - - - sourceLabels: [rhacs_tenant] - targetLabel: rhacs_instance_id - - - action: labeldrop - regex: rhacs_tenant diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 1927af74..aa8d82c7 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -321,48 +321,6 @@ spec: description: 'The maximum send rate over the last hour is {{ $value }} messages/second, which is dangerously approaching the maximum limit of 14 per second.' sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-040-aws-ses-violation.md" - - - name: rhacs-probe - rules: - - alert: RHACSProbeRunFailed - expr: acs_probe_last_failure_timestamp > 0 and acs_probe_last_failure_timestamp >= acs_probe_last_success_timestamp - for: 30m - labels: - severity: critical - annotations: - summary: "The latest probe run failed at `{{ $value | humanizeTimestamp }}`." - description: "The latest run of probe `{{ $labels.pod }}` failed at `{{ $value | humanizeTimestamp }}`." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-008-probe-run-failed.md" - - alert: RHACSProbeScrapeFailed - expr: | - avg_over_time(up{job="probe"}[10m]) < 0.5 and ON(pod) kube_pod_container_status_ready{container="probe"} == 1 - for: 20m - labels: - severity: critical - annotations: - summary: "Prometheus unable to scrape metrics from target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}`." - description: "During the last 10 minutes, only `{{ $value | humanizePercentage }}` of scrapes of target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}` were successful. This alert is raised when less than 50% of scrapes are successful." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-009-probe-unavailable.md" - - alert: RHACSProbeContainerDown - expr: | - avg_over_time(kube_pod_container_status_ready{container="probe"}[10m]) < 0.5 - for: 20m - labels: - severity: critical - annotations: - summary: "Probe container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` is down or in a CrashLoopBackOff status." - description: "Probe container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has been down or in a CrashLoopBackOff status for at least 10 minutes." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-009-probe-unavailable.md" - - alert: RHACSProbeContainerFrequentlyRestarting - expr: | - increase(kube_pod_container_status_restarts_total{container="probe"}[30m]) > 3 - labels: - severity: critical - annotations: - summary: "Probe container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times." - description: "Probe container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-009-probe-unavailable.md" - - name: deadmanssnitch rules: - alert: DeadMansSwitch diff --git a/resources/prometheus/unit_tests/RHACSProbeContainerDown.yaml b/resources/prometheus/unit_tests/RHACSProbeContainerDown.yaml deleted file mode 100644 index 73f66d4e..00000000 --- a/resources/prometheus/unit_tests/RHACSProbeContainerDown.yaml +++ /dev/null @@ -1,27 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: kube_pod_container_status_ready{namespace="rhacs-1234", pod="probe-1234", container="probe"} - values: "1+0x10 0+0x50" - alert_rule_test: - - eval_time: 15m - alertname: RHACSProbeContainerDown - exp_alerts: [] - - eval_time: 40m - alertname: RHACSProbeContainerDown - exp_alerts: - - exp_labels: - alertname: RHACSProbeContainerDown - pod: probe-1234 - container: probe - namespace: rhacs-1234 - severity: critical - exp_annotations: - summary: "Probe container `probe-1234/probe` in namespace `rhacs-1234` is down or in a CrashLoopBackOff status." - description: "Probe container `probe-1234/probe` in namespace `rhacs-1234` has been down or in a CrashLoopBackOff status for at least 10 minutes." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-009-probe-unavailable.md" diff --git a/resources/prometheus/unit_tests/RHACSProbeContainerFrequentlyRestarting.yaml b/resources/prometheus/unit_tests/RHACSProbeContainerFrequentlyRestarting.yaml deleted file mode 100644 index 21ed7f61..00000000 --- a/resources/prometheus/unit_tests/RHACSProbeContainerFrequentlyRestarting.yaml +++ /dev/null @@ -1,27 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: kube_pod_container_status_restarts_total{namespace="rhacs-1234", pod="probe-1234-5678", container="probe"} - values: "0+0x10 1+1x10 4+1x20" - alert_rule_test: - - eval_time: 10m - alertname: RHACSProbeContainerFrequentlyRestarting - exp_alerts: [] - - eval_time: 30m - alertname: RHACSProbeContainerFrequentlyRestarting - exp_alerts: - - exp_labels: - alertname: RHACSProbeContainerFrequentlyRestarting - container: probe - namespace: rhacs-1234 - pod: probe-1234-5678 - severity: critical - exp_annotations: - summary: "Probe container `probe-1234-5678/probe` in namespace `rhacs-1234` restarted more than 3 times." - description: "Probe container `probe-1234-5678/probe` in namespace `rhacs-1234` has restarted more than 3 times during the last 30 minutes." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-009-probe-unavailable.md" diff --git a/resources/prometheus/unit_tests/RHACSProbeRunFailed.yaml b/resources/prometheus/unit_tests/RHACSProbeRunFailed.yaml deleted file mode 100644 index 6d2bfb8b..00000000 --- a/resources/prometheus/unit_tests/RHACSProbeRunFailed.yaml +++ /dev/null @@ -1,31 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: acs_probe_last_failure_timestamp{namespace="rhacs-probe", pod="probe-1234"} - values: "0+0x10 0+0x15 7+0x60" - - series: acs_probe_last_success_timestamp{namespace="rhacs-probe", pod="probe-1234"} - values: "0+0x10 1+1x15 6+0x60" - alert_rule_test: - - eval_time: 0m - alertname: RHACSProbeRunFailed - exp_alerts: [] - - eval_time: 30m - alertname: RHACSProbeRunFailed - exp_alerts: [] - - eval_time: 60m - alertname: RHACSProbeRunFailed - exp_alerts: - - exp_labels: - alertname: RHACSProbeRunFailed - severity: critical - namespace: rhacs-probe - pod: probe-1234 - exp_annotations: - summary: "The latest probe run failed at `1970-01-01 00:00:07 +0000 UTC`." - description: "The latest run of probe `probe-1234` failed at `1970-01-01 00:00:07 +0000 UTC`." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-008-probe-run-failed.md" diff --git a/resources/prometheus/unit_tests/RHACSProbeScrapeFailed.yaml b/resources/prometheus/unit_tests/RHACSProbeScrapeFailed.yaml deleted file mode 100644 index 70106a69..00000000 --- a/resources/prometheus/unit_tests/RHACSProbeScrapeFailed.yaml +++ /dev/null @@ -1,30 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: up{namespace="rhacs-1234", pod="probe-1234-5678", job="probe", instance="1.2.3.4:9090"} - values: "0+0x20 1+0x20" - - series: kube_pod_container_status_ready{namespace="rhacs-1234", pod="probe-1234-5678", container="probe"} - values: "1+0x40" - alert_rule_test: - - eval_time: 10m - alertname: RHACSProbeScrapeFailed - exp_alerts: [] - - eval_time: 25m - alertname: RHACSProbeScrapeFailed - exp_alerts: - - exp_labels: - alertname: RHACSProbeScrapeFailed - instance: 1.2.3.4:9090 - namespace: rhacs-1234 - pod: probe-1234-5678 - severity: critical - job: probe - exp_annotations: - summary: "Prometheus unable to scrape metrics from target `probe-1234-5678` in namespace `rhacs-1234`." - description: "During the last 10 minutes, only `45.45%` of scrapes of target `probe-1234-5678` in namespace `rhacs-1234` were successful. This alert is raised when less than 50% of scrapes are successful." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-009-probe-unavailable.md"