From 4d205243d879e2c48c1c2514338ab5ea07289291 Mon Sep 17 00:00:00 2001 From: Moritz Clasmeier Date: Fri, 26 Apr 2024 14:03:40 +0200 Subject: [PATCH 1/3] Add SELinux alert --- resources/prometheus/prometheus-rules.yaml | 10 ++++++++ .../ClusterAuditSELinuxViolations.yaml | 23 +++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 resources/prometheus/unit_tests/ClusterAuditSELinuxViolations.yaml diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 94dfbc0d..9449a95d 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -756,3 +756,13 @@ cluster autoscaler. Limits can be adjusted by modifying the cluster autoscaler c the cluster autoscaler. This is calculated by summing the memory capacity for all nodes in the cluster and comparing that number against the maximum memory bytes value set for the cluster autoscaler. Limits can be adjusted by modifying the cluster autoscaler configuration." sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-042-modify-cluster-autoscaler.md" + - alert: ClusterAuditSELinuxViolations + expr: | + selinux_denials_sample_count > 0 + labels: + severity: info + annotations: + summary: "SELinux Violations occurring on cluster." + description: | + A cluster node logged {{ $value }} SELinux AVC denial(s) per minute to the audit log. 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-043-selinux-violation.md" diff --git a/resources/prometheus/unit_tests/ClusterAuditSELinuxViolations.yaml b/resources/prometheus/unit_tests/ClusterAuditSELinuxViolations.yaml new file mode 100644 index 00000000..07d8374d --- /dev/null +++ b/resources/prometheus/unit_tests/ClusterAuditSELinuxViolations.yaml @@ -0,0 +1,23 @@ +rule_files: + - /tmp/prometheus-rules-test.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: selinux_denials_sample_count{namespace="rhacs-cloudwatch"} + values: "1x5" + alert_rule_test: + - eval_time: 70s + alertname: ClusterAuditSELinuxViolations + exp_alerts: + - exp_labels: + alertname: ClusterAuditSELinuxViolations + namespace: rhacs-cloudwatch + severity: info + exp_annotations: + summary: "SELinux Violations occurring on cluster." + description: | + A cluster node logged 1 SELinux AVC denial(s) per minute to the audit log. + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-043-selinux-violation.md" From 8e3801e1b04429426fd05d85f2923b73277b808f Mon Sep 17 00:00:00 2001 From: Moritz Clasmeier Date: Fri, 26 Apr 2024 14:03:40 +0200 Subject: [PATCH 2/3] Add Network Policy alert --- resources/prometheus/prometheus-rules.yaml | 22 +++++++ .../ClusterAuditNetworkPolicyViolations.yaml | 59 +++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 resources/prometheus/unit_tests/ClusterAuditNetworkPolicyViolations.yaml diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 9449a95d..a9f9f332 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -766,3 +766,25 @@ for the cluster autoscaler. Limits can be adjusted by modifying the cluster auto description: | A cluster node logged {{ $value }} SELinux AVC denial(s) per minute to the audit log. 
 sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-043-selinux-violation.md" + - alert: ClusterAuditNetworkPolicyViolations + expr: | + network_policy_denials_sample_count > 0 + for: 10m + labels: + severity: info + annotations: + summary: "Network Policy Violations occurring on cluster." + description: | + A cluster node logged Network Policy ACL denial(s) for 10 minutes. + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-044-network-policy-violation.md" + - alert: ClusterAuditNetworkPolicyViolations + expr: | + network_policy_denials_sample_count >= 15 + for: 1m + labels: + severity: info + annotations: + summary: "Network Policy Violations occurring on cluster." + description: | + A cluster node logged at least {{ $value }} Network Policy ACL denial(s) per minute. + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-044-network-policy-violation.md" diff --git a/resources/prometheus/unit_tests/ClusterAuditNetworkPolicyViolations.yaml b/resources/prometheus/unit_tests/ClusterAuditNetworkPolicyViolations.yaml new file mode 100644 index 00000000..18825ca3 --- /dev/null +++ b/resources/prometheus/unit_tests/ClusterAuditNetworkPolicyViolations.yaml @@ -0,0 +1,59 @@ +rule_files: + - /tmp/prometheus-rules-test.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: network_policy_denials_sample_count{namespace="rhacs-cloudwatch"} + values: "15x1" + alert_rule_test: + - eval_time: 70s + alertname: ClusterAuditNetworkPolicyViolations + exp_alerts: + - exp_labels: + alertname: ClusterAuditNetworkPolicyViolations + namespace: rhacs-cloudwatch + severity: info + exp_annotations: + summary: "Network Policy Violations occurring on cluster." + description: | + A cluster node logged at least 15 Network Policy ACL denial(s) per minute. 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-044-network-policy-violation.md" + + - interval: 1m + input_series: + - series: network_policy_denials_sample_count{namespace="rhacs-cloudwatch"} + values: "1x10" + alert_rule_test: + - eval_time: 610s + alertname: ClusterAuditNetworkPolicyViolations + exp_alerts: + - exp_labels: + alertname: ClusterAuditNetworkPolicyViolations + namespace: rhacs-cloudwatch + severity: info + exp_annotations: + summary: "Network Policy Violations occurring on cluster." + description: | + A cluster node logged Network Policy ACL denial(s) for 10 minutes. + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-044-network-policy-violation.md" + + - interval: 1m + input_series: + - series: network_policy_denials_sample_count{namespace="rhacs-cloudwatch"} + values: "1x9 0" + alert_rule_test: + - eval_time: 10m + alertname: ClusterAuditNetworkPolicyViolations + exp_alerts: [] + + - interval: 1m + input_series: + - series: network_policy_denials_sample_count{namespace="rhacs-cloudwatch"} + values: "14x1" + alert_rule_test: + - eval_time: 70s + alertname: ClusterAuditNetworkPolicyViolations + exp_alerts: [] From d629fc38781c70c19988949cc80fc21c32a79311 Mon Sep 17 00:00:00 2001 From: Ludovic Cleroux Date: Tue, 18 Jun 2024 09:36:43 +0200 Subject: [PATCH 3/3] ROX-24822: Fix acs-cs annotations --- .../generated/dashboards/rhacs-central-slo-configmap.yaml | 4 ++-- .../generated/dashboards/rhacs-central-slo-dashboard.yaml | 4 ++-- .../dashboards/rhacs-cluster-overview-configmap.yaml | 4 ++-- .../dashboards/rhacs-cluster-overview-dashboard.yaml | 4 ++-- resources/grafana/sources/rhacs-central-slo.json | 4 ++-- resources/grafana/sources/rhacs-cluster-overview.json | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/resources/grafana/generated/dashboards/rhacs-central-slo-configmap.yaml 
b/resources/grafana/generated/dashboards/rhacs-central-slo-configmap.yaml index 01962278..199a9808 100644 --- a/resources/grafana/generated/dashboards/rhacs-central-slo-configmap.yaml +++ b/resources/grafana/generated/dashboards/rhacs-central-slo-configmap.yaml @@ -37,7 +37,7 @@ data: }, "enable": true, "iconColor": "purple", - "expr": "count (count by (git_version) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"}, \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", + "expr": "count (count by (git_version) (label_replace(count_over_time(kubernetes_build_info{job!~\"kube-dns|coredns\"}[${__interval}]), \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", "name": "Kubernetes Upgrade", "textFormat": "Kubernetes Upgrade" }, @@ -48,7 +48,7 @@ data: }, "enable": true, "iconColor": "red", - "expr": "count (count by (gitVersion) (openshift_apiserver_build_info)) > 1", + "expr": "count (count by (gitVersion) (count_over_time (openshift_apiserver_build_info[${__interval}]))) > 1", "name": "OpenShift Upgrade", "textFormat": "OpenShift Upgrade" } diff --git a/resources/grafana/generated/dashboards/rhacs-central-slo-dashboard.yaml b/resources/grafana/generated/dashboards/rhacs-central-slo-dashboard.yaml index ff27bad0..f5649166 100644 --- a/resources/grafana/generated/dashboards/rhacs-central-slo-dashboard.yaml +++ b/resources/grafana/generated/dashboards/rhacs-central-slo-dashboard.yaml @@ -37,7 +37,7 @@ spec: }, "enable": true, "iconColor": "purple", - "expr": "count (count by (git_version) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"}, \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", + "expr": "count (count by (git_version) (label_replace(count_over_time(kubernetes_build_info{job!~\"kube-dns|coredns\"}[${__interval}]), \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", "name": "Kubernetes Upgrade", "textFormat": "Kubernetes Upgrade" }, @@ -48,7 +48,7 @@ 
spec: }, "enable": true, "iconColor": "red", - "expr": "count (count by (gitVersion) (openshift_apiserver_build_info)) > 1", + "expr": "count (count by (gitVersion) (count_over_time (openshift_apiserver_build_info[${__interval}]))) > 1", "name": "OpenShift Upgrade", "textFormat": "OpenShift Upgrade" } diff --git a/resources/grafana/generated/dashboards/rhacs-cluster-overview-configmap.yaml b/resources/grafana/generated/dashboards/rhacs-cluster-overview-configmap.yaml index bfa11a11..06c2a961 100644 --- a/resources/grafana/generated/dashboards/rhacs-cluster-overview-configmap.yaml +++ b/resources/grafana/generated/dashboards/rhacs-cluster-overview-configmap.yaml @@ -37,7 +37,7 @@ data: }, "enable": true, "iconColor": "purple", - "expr": "count (count by (git_version) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"}, \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", + "expr": "count (count by (git_version) (label_replace(count_over_time(kubernetes_build_info{job!~\"kube-dns|coredns\"}[${__interval}]), \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", "name": "Kubernetes Upgrade", "textFormat": "Kubernetes Upgrade" }, @@ -48,7 +48,7 @@ data: }, "enable": true, "iconColor": "red", - "expr": "count (count by (gitVersion) (openshift_apiserver_build_info)) > 1", + "expr": "count (count by (gitVersion) (count_over_time (openshift_apiserver_build_info[${__interval}]))) > 1", "name": "OpenShift Upgrade", "textFormat": "OpenShift Upgrade" } diff --git a/resources/grafana/generated/dashboards/rhacs-cluster-overview-dashboard.yaml b/resources/grafana/generated/dashboards/rhacs-cluster-overview-dashboard.yaml index 6ea35bf0..7c28452d 100644 --- a/resources/grafana/generated/dashboards/rhacs-cluster-overview-dashboard.yaml +++ b/resources/grafana/generated/dashboards/rhacs-cluster-overview-dashboard.yaml @@ -37,7 +37,7 @@ spec: }, "enable": true, "iconColor": "purple", - "expr": "count (count by (git_version) 
(label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"}, \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", + "expr": "count (count by (git_version) (label_replace(count_over_time(kubernetes_build_info{job!~\"kube-dns|coredns\"}[${__interval}]), \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", "name": "Kubernetes Upgrade", "textFormat": "Kubernetes Upgrade" }, @@ -48,7 +48,7 @@ spec: }, "enable": true, "iconColor": "red", - "expr": "count (count by (gitVersion) (openshift_apiserver_build_info)) > 1", + "expr": "count (count by (gitVersion) (count_over_time (openshift_apiserver_build_info[${__interval}]))) > 1", "name": "OpenShift Upgrade", "textFormat": "OpenShift Upgrade" } diff --git a/resources/grafana/sources/rhacs-central-slo.json b/resources/grafana/sources/rhacs-central-slo.json index caa3c5a3..d2a77378 100644 --- a/resources/grafana/sources/rhacs-central-slo.json +++ b/resources/grafana/sources/rhacs-central-slo.json @@ -26,7 +26,7 @@ }, "enable": true, "iconColor": "purple", - "expr": "count (count by (git_version) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"}, \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", + "expr": "count (count by (git_version) (label_replace(count_over_time(kubernetes_build_info{job!~\"kube-dns|coredns\"}[${__interval}]), \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", "name": "Kubernetes Upgrade", "textFormat": "Kubernetes Upgrade" }, @@ -37,7 +37,7 @@ }, "enable": true, "iconColor": "red", - "expr": "count (count by (gitVersion) (openshift_apiserver_build_info)) > 1", + "expr": "count (count by (gitVersion) (count_over_time (openshift_apiserver_build_info[${__interval}]))) > 1", "name": "OpenShift Upgrade", "textFormat": "OpenShift Upgrade" } diff --git a/resources/grafana/sources/rhacs-cluster-overview.json b/resources/grafana/sources/rhacs-cluster-overview.json index 8969a623..ca0e6c24 100644 --- 
a/resources/grafana/sources/rhacs-cluster-overview.json +++ b/resources/grafana/sources/rhacs-cluster-overview.json @@ -26,7 +26,7 @@ }, "enable": true, "iconColor": "purple", - "expr": "count (count by (git_version) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"}, \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", + "expr": "count (count by (git_version) (label_replace(count_over_time(kubernetes_build_info{job!~\"kube-dns|coredns\"}[${__interval}]), \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", "name": "Kubernetes Upgrade", "textFormat": "Kubernetes Upgrade" }, @@ -37,7 +37,7 @@ }, "enable": true, "iconColor": "red", - "expr": "count (count by (gitVersion) (openshift_apiserver_build_info)) > 1", + "expr": "count (count by (gitVersion) (count_over_time (openshift_apiserver_build_info[${__interval}]))) > 1", "name": "OpenShift Upgrade", "textFormat": "OpenShift Upgrade" }