diff --git a/stable/prometheus-operator/Chart.yaml b/stable/prometheus-operator/Chart.yaml index c7f9764c41c1..097859931b28 100644 --- a/stable/prometheus-operator/Chart.yaml +++ b/stable/prometheus-operator/Chart.yaml @@ -12,7 +12,7 @@ sources: - https://github.com/coreos/kube-prometheus - https://github.com/coreos/prometheus-operator - https://coreos.com/operators/prometheus -version: 8.13.11 +version: 8.13.12 appVersion: 0.38.1 tillerVersion: ">=2.12.0" home: https://github.com/coreos/prometheus-operator diff --git a/stable/prometheus-operator/README.md b/stable/prometheus-operator/README.md index 26c50356a4cd..4add3463d079 100644 --- a/stable/prometheus-operator/README.md +++ b/stable/prometheus-operator/README.md @@ -154,6 +154,7 @@ The following tables list the configurable parameters of the prometheus-operator | `defaultRules.rules.k8s` | Create K8S default rules| `true` | | `defaultRules.rules.kubeApiserver` | Create Api Server default rules| `true` | | `defaultRules.rules.kubeApiserverError` | Create Api Server Error default rules| `true` | +| `defaultRules.rules.kubeApiserverSlos` | Create Api Server SLOs default rules| `true` | | `defaultRules.rules.kubePrometheusNodeAlerting` | Create Node Alerting default rules| `true` | | `defaultRules.rules.kubePrometheusNodeRecording` | Create Node Recording default rules| `true` | | `defaultRules.rules.kubeScheduler` | Create Kubernetes Scheduler default rules| `true` | diff --git a/stable/prometheus-operator/hack/sync_prometheus_rules.py b/stable/prometheus-operator/hack/sync_prometheus_rules.py index 26dd14804a1e..3f0d063ffc25 100755 --- a/stable/prometheus-operator/hack/sync_prometheus_rules.py +++ b/stable/prometheus-operator/hack/sync_prometheus_rules.py @@ -55,6 +55,7 @@ def new_representer(dumper, data): 'k8s.rules': ' .Values.defaultRules.rules.k8s', 'kube-apiserver.rules': ' .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserver', 'kube-apiserver-error': ' .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverError', + 'kube-apiserver-slos': ' .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverSlos', 'kube-prometheus-node-alerting.rules': ' .Values.defaultRules.rules.kubePrometheusNodeAlerting', 'kube-prometheus-node-recording.rules': ' .Values.defaultRules.rules.kubePrometheusNodeRecording', 'kube-scheduler.rules': ' .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler', diff --git a/stable/prometheus-operator/templates/prometheus/rules-1.14/etcd.yaml b/stable/prometheus-operator/templates/prometheus/rules-1.14/etcd.yaml index 09fa80a44f5c..b958a55e2dfd 100644 --- a/stable/prometheus-operator/templates/prometheus/rules-1.14/etcd.yaml +++ b/stable/prometheus-operator/templates/prometheus/rules-1.14/etcd.yaml @@ -24,6 +24,21 @@ spec: groups: - name: etcd rules: + - alert: etcdMembersDown + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).' + expr: |- + max by (job) ( + sum by (job) (up{job=~".*etcd.*"} == bool 0) + or + count by (job,endpoint) ( + sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01 + ) + ) + > 0 + for: 3m + labels: + severity: critical - alert: etcdInsufficientMembers annotations: message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).' @@ -40,9 +55,9 @@ spec: severity: critical - alert: etcdHighNumberOfLeaderChanges annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.' - expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 - for: 15m + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' + expr: increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3 + for: 5m labels: severity: warning - alert: etcdHighNumberOfFailedGRPCRequests @@ -87,7 +102,7 @@ spec: severity: warning - alert: etcdHighNumberOfFailedProposals annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 for: 15m labels: diff --git a/stable/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml b/stable/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml index f9a1dff5c669..8f2aa9fbfb21 100644 --- a/stable/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml +++ b/stable/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml @@ -4,7 +4,7 @@ Do not change in-place! In order to change this file first read following link: https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverSlos }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/stable/prometheus-operator/templates/prometheus/rules/etcd.yaml b/stable/prometheus-operator/templates/prometheus/rules/etcd.yaml index 5de9b8328677..dedad1ab6dde 100644 --- a/stable/prometheus-operator/templates/prometheus/rules/etcd.yaml +++ b/stable/prometheus-operator/templates/prometheus/rules/etcd.yaml @@ -24,6 +24,21 @@ spec: groups: - name: etcd rules: + - alert: etcdMembersDown + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).' + expr: |- + max by (job) ( + sum by (job) (up{job=~".*etcd.*"} == bool 0) + or + count by (job,endpoint) ( + sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01 + ) + ) + > 0 + for: 3m + labels: + severity: critical - alert: etcdInsufficientMembers annotations: message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).' @@ -40,9 +55,9 @@ spec: severity: critical - alert: etcdHighNumberOfLeaderChanges annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.' - expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 - for: 15m + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' + expr: increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3 + for: 5m labels: severity: warning - alert: etcdHighNumberOfFailedGRPCRequests @@ -87,7 +102,7 @@ spec: severity: warning - alert: etcdHighNumberOfFailedProposals annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 for: 15m labels: diff --git a/stable/prometheus-operator/values.yaml b/stable/prometheus-operator/values.yaml index 75d5417f4db2..d54ff6b2ebaa 100644 --- a/stable/prometheus-operator/values.yaml +++ b/stable/prometheus-operator/values.yaml @@ -31,6 +31,7 @@ defaultRules: k8s: true kubeApiserver: true kubeApiserverError: true + kubeApiserverSlos: true kubePrometheusNodeAlerting: true kubePrometheusNodeRecording: true kubernetesAbsent: true