Skip to content
This repository has been archived by the owner on Feb 22, 2022. It is now read-only.

Commit

Permalink
make kube-apiserver-slos configurable via defaultRules.rules.kubeApiserverSlos (#22499)
Browse files Browse the repository at this point in the history

Signed-off-by: Mario Hros <[email protected]>

Co-authored-by: Mario Hros <[email protected]>
  • Loading branch information
k3a and Mario Hros authored Jun 2, 2020
1 parent 2cea7e0 commit 88a5176
Show file tree
Hide file tree
Showing 7 changed files with 43 additions and 10 deletions.
2 changes: 1 addition & 1 deletion stable/prometheus-operator/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ sources:
- https://github.com/coreos/kube-prometheus
- https://github.com/coreos/prometheus-operator
- https://coreos.com/operators/prometheus
version: 8.13.11
version: 8.13.12
appVersion: 0.38.1
tillerVersion: ">=2.12.0"
home: https://github.com/coreos/prometheus-operator
Expand Down
1 change: 1 addition & 0 deletions stable/prometheus-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ The following tables list the configurable parameters of the prometheus-operator
| `defaultRules.rules.k8s` | Create K8S default rules| `true` |
| `defaultRules.rules.kubeApiserver` | Create Api Server default rules| `true` |
| `defaultRules.rules.kubeApiserverError` | Create Api Server Error default rules| `true` |
| `defaultRules.rules.kubeApiserverSlos` | Create Api Server SLOs default rules| `true` |
| `defaultRules.rules.kubePrometheusNodeAlerting` | Create Node Alerting default rules| `true` |
| `defaultRules.rules.kubePrometheusNodeRecording` | Create Node Recording default rules| `true` |
| `defaultRules.rules.kubeScheduler` | Create Kubernetes Scheduler default rules| `true` |
Expand Down
1 change: 1 addition & 0 deletions stable/prometheus-operator/hack/sync_prometheus_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def new_representer(dumper, data):
'k8s.rules': ' .Values.defaultRules.rules.k8s',
'kube-apiserver.rules': ' .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserver',
'kube-apiserver-error': ' .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverError',
'kube-apiserver-slos': ' .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverSlos',
'kube-prometheus-node-alerting.rules': ' .Values.defaultRules.rules.kubePrometheusNodeAlerting',
'kube-prometheus-node-recording.rules': ' .Values.defaultRules.rules.kubePrometheusNodeRecording',
'kube-scheduler.rules': ' .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,21 @@ spec:
groups:
- name: etcd
rules:
- alert: etcdMembersDown
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
expr: |-
max by (job) (
sum by (job) (up{job=~".*etcd.*"} == bool 0)
or
count by (job,endpoint) (
sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01
)
)
> 0
for: 3m
labels:
severity: critical
- alert: etcdInsufficientMembers
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
Expand All @@ -40,9 +55,9 @@ spec:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.'
expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
for: 15m
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
expr: increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3
for: 5m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
Expand Down Expand Up @@ -87,7 +102,7 @@ spec:
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Do not change in-place! In order to change this file first read following link:
https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverSlos }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand Down
23 changes: 19 additions & 4 deletions stable/prometheus-operator/templates/prometheus/rules/etcd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,21 @@ spec:
groups:
- name: etcd
rules:
- alert: etcdMembersDown
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
expr: |-
max by (job) (
sum by (job) (up{job=~".*etcd.*"} == bool 0)
or
count by (job,endpoint) (
sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01
)
)
> 0
for: 3m
labels:
severity: critical
- alert: etcdInsufficientMembers
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
Expand All @@ -40,9 +55,9 @@ spec:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.'
expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
for: 15m
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
expr: increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3
for: 5m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
Expand Down Expand Up @@ -87,7 +102,7 @@ spec:
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
Expand Down
1 change: 1 addition & 0 deletions stable/prometheus-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ defaultRules:
k8s: true
kubeApiserver: true
kubeApiserverError: true
kubeApiserverSlos: true
kubePrometheusNodeAlerting: true
kubePrometheusNodeRecording: true
kubernetesAbsent: true
Expand Down

0 comments on commit 88a5176

Please sign in to comment.