# Patch summary (free text ahead of the first "diff --git" header; patch tools skip it).
#
# - mixin/thanos/alerts/sidecar.libsonnet: adds the ThanosSidecarPrometheusDown
#   alert (critical, for 5m) to the 'thanos-sidecar.rules' group. The selector is
#   injected through %(selector)s from thanos.sidecar.
# - examples/alerts/alerts.yaml: adds the same rule with the selector rendered as
#   job=~"thanos-sidecar.*".
# - examples/alerts/alerts.md: replaces the hand-written Sidecar examples with the
#   content embedded from ../tmp/thanos-sidecar.rules.yaml via the embedmd
#   directive. The ThanosSidecarBucketOperationsFailed and
#   ThanosSidecarGrpcErrorRate examples are dropped from the document.
#
# NOTE(review): ThanosSidecarUnhealthy wraps its expression in count(...) with no
# grouping clause. That aggregation drops the job/pod labels, so {{$labels.job}}
# and {{$labels.pod}} in its message render empty, and {{ $value }} is the number
# of stale sidecars rather than a duration in seconds. The libsonnet source of
# that rule is outside this patch's visible hunks, so it is left unchanged here;
# fix it in the mixin and regenerate the examples.
diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md
index b45f78d8c1..e9b7da3017 100644
--- a/examples/alerts/alerts.md
+++ b/examples/alerts/alerts.md
@@ -236,39 +236,26 @@ rules:
 
 ## Sidecar
 
-[//]: # "TODO(kakkoyun): Generate sidecar rules using thanos-mixin."
-
+[embedmd]:# (../tmp/thanos-sidecar.rules.yaml yaml)
 ```yaml
+name: thanos-sidecar.rules
+rules:
 - alert: ThanosSidecarPrometheusDown
-  expr: thanos_sidecar_prometheus_up{name="prometheus"} == 0
-  for: 5m
-  labels:
-    team: TEAM
   annotations:
-    summary: Thanos Sidecar cannot connect to Prometheus
-    impact: Prometheus configuration is not being refreshed
-    action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace
-    dashboard: SIDECAR_URL
-- alert: ThanosSidecarBucketOperationsFailed
-  expr: rate(thanos_objstore_bucket_operation_failures_total{name="prometheus"}[5m]) > 0
+    message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect to Prometheus.
+  expr: |
+    sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0)
   for: 5m
   labels:
-    team: TEAM
+    severity: critical
+- alert: ThanosSidecarUnhealthy
   annotations:
-    summary: Thanos Sidecar bucket operations are failing
-    impact: We will lose metrics data if not fixed in 24h
-    action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace
-    dashboard: SIDECAR_URL
-- alert: ThanosSidecarGrpcErrorRate
-  expr: rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable",name="prometheus"}[5m]) > 0
-  for: 5m
+    message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value
+      }} seconds.
+  expr: |
+    count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0
   labels:
-    team: TEAM
-  annotations:
-    summary: Thanos Sidecar is returning Internal/Unavailable errors
-    impact: Prometheus queries are failing
-    action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace
-    dashboard: SIDECAR_URL
+    severity: critical
 ```
 
 ## Query
diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
index a1e77d932a..8db6674442 100644
--- a/examples/alerts/alerts.yaml
+++ b/examples/alerts/alerts.yaml
@@ -212,6 +212,14 @@ groups:
       severity: warning
 - name: thanos-sidecar.rules
   rules:
+  - alert: ThanosSidecarPrometheusDown
+    annotations:
+      message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect to Prometheus.
+    expr: |
+      sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0)
+    for: 5m
+    labels:
+      severity: critical
   - alert: ThanosSidecarUnhealthy
     annotations:
       message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{
diff --git a/mixin/thanos/alerts/sidecar.libsonnet b/mixin/thanos/alerts/sidecar.libsonnet
index 7c80e3af08..58aee3bb0b 100644
--- a/mixin/thanos/alerts/sidecar.libsonnet
+++ b/mixin/thanos/alerts/sidecar.libsonnet
@@ -9,6 +9,19 @@
       {
         name: 'thanos-sidecar.rules',
         rules: [
+          {
+            alert: 'ThanosSidecarPrometheusDown',
+            annotations: {
+              message: 'Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect to Prometheus.',
+            },
+            expr: |||
+              sum by (job, pod) (thanos_sidecar_prometheus_up{%(selector)s} == 0)
+            ||| % thanos.sidecar,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+          },
           {
             alert: 'ThanosSidecarUnhealthy',
             annotations: {