Skip to content

Commit

Permalink
Merge pull request #120 from appuio/dep/logging-5.8
Browse files Browse the repository at this point in the history
Support OpenShift Logging 5.8
  • Loading branch information
DebakelOrakel authored Jan 11, 2024
2 parents 890077b + 2a308e7 commit a79ee05
Show file tree
Hide file tree
Showing 22 changed files with 819 additions and 237 deletions.
5 changes: 4 additions & 1 deletion alerts.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.6/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.6/fluentd_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.7/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.7/fluentd_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/master/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert master/fluentd_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.8/config/prometheus/collector_alerts.yaml release-5.8/collector_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/master/config/prometheus/collector_alerts.yaml master/collector_prometheus_alerts.yaml

https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.6/files/prometheus_alerts.yml release-5.6/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.7/files/prometheus_alerts.yml release-5.7/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.8/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/master/files/prometheus_alerts.yml master/elasticsearch_operator_prometheus_alerts.yaml

https://raw.githubusercontent.com/openshift/loki/release-5.6/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.6/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.7/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.7/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.8/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.8/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/main/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml master/lokistack_prometheus_alerts.yaml
4 changes: 2 additions & 2 deletions class/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ parameters:
"False": {}

namespace: openshift-logging
version: '5.7'
version: '5.8'
channel: 'stable-${openshift4_logging:version}'
alerts: 'release-${openshift4_logging:version}'

Expand All @@ -29,7 +29,7 @@ parameters:
endpoint: ''
bucketnames: '${cluster:name}-logstore'
spec:
size: 1x.extra-small
size: 1x.demo
storage:
schemas:
- version: v12
Expand Down
10 changes: 9 additions & 1 deletion component/alertrules.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,17 @@ local prometheus_rules(name, groups, baseURL) = kube._Object('monitoring.coreos.

// Elasticstack alerts

// True when the configured logging version uses the collector alert rules
// instead of the fluentd ones ('5.8' and 'master'). The comparison already
// yields a boolean, so no `if ... then true else false` wrapper is needed.
local isVersion58 = params.version == '5.8' || params.version == 'master';

local esStorageGroup = {
name: 'elasticsearch_node_storage.alerts',
rules: [ predictESStorage ],
};
local fluentdGroup = if !isVersion58 then loadFile('fluentd_prometheus_alerts.yaml')[0].groups else [];

local esGroups =
loadFile('elasticsearch_operator_prometheus_alerts.yaml')[0].groups +
loadFile('fluentd_prometheus_alerts.yaml')[0].groups +
fluentdGroup +
[
if predict_storage_alert.enabled then esStorageGroup,
];
Expand All @@ -143,7 +146,12 @@ local esBaseURL = 'https://github.com/openshift/elasticsearch-operator/blob/mast
local lokiGroups = loadFile('lokistack_prometheus_alerts.yaml')[0].groups;
local lokiBaseURL = 'https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md';

// Collector alerts

// The collector alerts file is a complete PrometheusRule manifest (apiVersion,
// kind, metadata, spec), so its rule groups are read from `.spec.groups` of the
// first element returned by loadFile rather than from a top-level `.groups`.
local collectorGroups = loadFile('collector_prometheus_alerts.yaml')[0].spec.groups;

// Output object of this file. Each field name is computed with an `if` that has
// no `else`; when the condition is false the name evaluates to null and Jsonnet
// omits the field, so a rules object is only emitted when its condition holds.
{
// Elasticsearch rules: emitted only when elasticsearch.enabled is true.
[if elasticsearch.enabled then '60_elasticsearch_alerts']: prometheus_rules('syn-elasticsearch-logging-rules', esGroups, esBaseURL),
// LokiStack rules: emitted only when loki.enabled is true.
[if loki.enabled then '60_lokistack_alerts']: prometheus_rules('syn-loki-logging-rules', lokiGroups, lokiBaseURL),
// Collector rules: emitted only when isVersion58 is true; an empty string is
// passed as the baseURL argument.
[if isVersion58 then '60_collector_alerts']: prometheus_rules('syn-collector-rules', collectorGroups, ''),
}
71 changes: 71 additions & 0 deletions component/extracted_alerts/master/collector_prometheus_alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: collector
namespace: openshift-logging
spec:
groups:
- name: logging_collector.alerts
rules:
- alert: CollectorNodeDown
annotations:
message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
summary: "Collector cannot be scraped"
expr: |
up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0
for: 10m
labels:
service: collector
severity: critical
- alert: CollectorHighErrorRate
annotations:
message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high"
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.001
for: 15m
labels:
service: collector
severity: critical
- alert: CollectorVeryHighErrorRate
annotations:
message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high"
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.05
for: 15m
labels:
service: collector
severity: critical
# Alerts on a fluentd output whose buffer queue length keeps increasing
# (positive derivative over 10m and a delta above 1 over 1h), held for 1h.
- alert: FluentdQueueLengthIncreasing
  annotations:
    message: "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously."
    summary: "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}."
  expr: |
    sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 )
  for: 1h
  labels:
    service: collector
    # Lowercase, consistent with the `critical` values on the other alerts in
    # this group; label values are matched case-sensitively.
    severity: warning
- name: logging_clusterlogging_telemetry.rules
rules:
- expr: |
sum by(cluster)(log_collected_bytes_total)
record: cluster:log_collected_bytes_total:sum
- expr: |
sum by(cluster)(log_logged_bytes_total)
record: cluster:log_logged_bytes_total:sum
- expr: |
sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_errors_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_num_errors[2m]))
record: collector:log_num_errors:sum_rate
- expr: |
sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_received_events_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_emit_records[2m]))
record: collector:received_events:sum_rate
64 changes: 0 additions & 64 deletions component/extracted_alerts/master/fluentd_prometheus_alerts.yaml

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: collector
namespace: openshift-logging
spec:
groups:
- name: logging_collector.alerts
rules:
- alert: CollectorNodeDown
annotations:
message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
summary: "Collector cannot be scraped"
expr: |
up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0
for: 10m
labels:
service: collector
severity: critical
- alert: CollectorHighErrorRate
annotations:
message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high"
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.001
for: 15m
labels:
service: collector
severity: critical
- alert: CollectorVeryHighErrorRate
annotations:
message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high"
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.05
for: 15m
labels:
service: collector
severity: critical
# Alerts on a fluentd output whose buffer queue length keeps increasing
# (positive derivative over 10m and a delta above 1 over 1h), held for 1h.
- alert: FluentdQueueLengthIncreasing
  annotations:
    message: "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously."
    summary: "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}."
  expr: |
    sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 )
  for: 1h
  labels:
    service: collector
    # Lowercase, consistent with the `critical` values on the other alerts in
    # this group; label values are matched case-sensitively.
    severity: warning
- name: logging_clusterlogging_telemetry.rules
rules:
- expr: |
sum by(cluster)(log_collected_bytes_total)
record: cluster:log_collected_bytes_total:sum
- expr: |
sum by(cluster)(log_logged_bytes_total)
record: cluster:log_logged_bytes_total:sum
- expr: |
sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_errors_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_num_errors[2m]))
record: collector:log_num_errors:sum_rate
- expr: |
sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_received_events_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_emit_records[2m]))
record: collector:received_events:sum_rate
Loading

0 comments on commit a79ee05

Please sign in to comment.