Skip to content

Commit

Permalink
feat: Add redfish alert rules for null health.
Browse files Browse the repository at this point in the history
Add new alert rules with lower severity (warning) for when the health
data isn't available and is reported as either "NA" or "N/A".

This prevents diluting the real critical health-related alerts when the health
data isn't available.
  • Loading branch information
dashmage committed Dec 8, 2023
1 parent 08c5079 commit b00b228
Show file tree
Hide file tree
Showing 2 changed files with 202 additions and 6 deletions.
79 changes: 73 additions & 6 deletions src/prometheus_alert_rules/redfish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ groups:
LABELS = {{ $labels }}
- alert: RedfishSensorHealthNotOk
expr: redfish_sensor_info{health != "OK"}
expr: redfish_sensor_info{health!~"OK|N/A"}
for: 0m
labels:
severity: critical
Expand All @@ -37,8 +37,20 @@ groups:
SENSOR_READING = {{ $labels.reading }}
LABELS = {{ $labels }}
# Warning-level rule for sensor series whose health label is the literal
# placeholder "N/A". The critical RedfishSensorHealthNotOk rule excludes this
# value via health!~"OK|N/A", so a missing health reading is reported here
# instead of as a critical alert.
# NOTE: sensors are matched on "N/A"; the other redfish_*_info rules use "NA".
- alert: RedfishSensorHealthNotAvailable
expr: redfish_sensor_info{health="N/A"}
for: 0m
labels:
severity: warning
annotations:
summary: Redfish sensor health not available. (instance {{ $labels.instance }})
description: |
Redfish sensor health not available.
SENSOR_READING = {{ $labels.reading }}
LABELS = {{ $labels }}
- alert: RedfishProcessorHealthNotOk
expr: redfish_processor_info{health != "OK"}
expr: redfish_processor_info{health!~"OK|NA"}
for: 0m
labels:
severity: critical
Expand All @@ -48,8 +60,19 @@ groups:
Redfish processor health not OK.
LABELS = {{ $labels }}
# Warning-level rule for processor series whose health label is the literal
# placeholder "NA". The critical RedfishProcessorHealthNotOk rule excludes this
# value via health!~"OK|NA", so missing health data is reported here instead.
- alert: RedfishProcessorHealthNotAvailable
expr: redfish_processor_info{health="NA"}
for: 0m
labels:
severity: warning
annotations:
summary: Redfish processor health not available. (instance {{ $labels.instance }})
description: |
Redfish processor health not available.
LABELS = {{ $labels }}
- alert: RedfishStorageControllerHealthNotOk
expr: redfish_storage_controller_info{health != "OK"}
expr: redfish_storage_controller_info{health!~"OK|NA"}
for: 0m
labels:
severity: critical
Expand All @@ -59,8 +82,19 @@ groups:
Redfish storage controller health not OK.
LABELS = {{ $labels }}
# Warning-level rule for storage controller series whose health label is the
# literal placeholder "NA". The critical RedfishStorageControllerHealthNotOk
# rule excludes this value via health!~"OK|NA".
- alert: RedfishStorageControllerHealthNotAvailable
expr: redfish_storage_controller_info{health="NA"}
for: 0m
labels:
severity: warning
annotations:
summary: Redfish storage controller health not available. (instance {{ $labels.instance }})
description: |
Redfish storage controller health not available.
LABELS = {{ $labels }}
- alert: RedfishChassisHealthNotOk
expr: redfish_chassis_info{health != "OK"}
expr: redfish_chassis_info{health!~"OK|NA"}
for: 0m
labels:
severity: critical
Expand All @@ -70,8 +104,19 @@ groups:
Redfish chassis health not OK.
LABELS = {{ $labels }}
# Warning-level rule for chassis series whose health label is the literal
# placeholder "NA". The critical RedfishChassisHealthNotOk rule excludes this
# value via health!~"OK|NA".
- alert: RedfishChassisHealthNotAvailable
expr: redfish_chassis_info{health="NA"}
for: 0m
labels:
severity: warning
annotations:
summary: Redfish chassis health not available. (instance {{ $labels.instance }})
description: |
Redfish chassis health not available.
LABELS = {{ $labels }}
- alert: RedfishStorageDriveHealthNotOk
expr: redfish_storage_drive_info{health != "OK"}
expr: redfish_storage_drive_info{health!~"OK|NA"}
for: 0m
labels:
severity: critical
Expand All @@ -81,8 +126,19 @@ groups:
Redfish storage drive health not OK.
LABELS = {{ $labels }}
# Warning-level rule for storage drive series whose health label is the literal
# placeholder "NA". The critical RedfishStorageDriveHealthNotOk rule excludes
# this value via health!~"OK|NA".
- alert: RedfishStorageDriveHealthNotAvailable
expr: redfish_storage_drive_info{health="NA"}
for: 0m
labels:
severity: warning
annotations:
summary: Redfish storage drive health not available. (instance {{ $labels.instance }})
description: |
Redfish storage drive health not available.
LABELS = {{ $labels }}
- alert: RedfishMemoryDimmHealthNotOk
expr: redfish_memory_dimm_info{health != "OK"}
expr: redfish_memory_dimm_info{health!~"OK|NA"}
for: 0m
labels:
severity: critical
Expand All @@ -92,6 +148,17 @@ groups:
Redfish memory dimm health not OK.
LABELS = {{ $labels }}
# Warning-level rule for memory DIMM series whose health label is the literal
# placeholder "NA". The critical RedfishMemoryDimmHealthNotOk rule excludes
# this value via health!~"OK|NA".
- alert: RedfishMemoryDimmHealthNotAvailable
expr: redfish_memory_dimm_info{health="NA"}
for: 0m
labels:
severity: warning
annotations:
summary: Redfish memory dimm health not available. (instance {{ $labels.instance }})
description: |
Redfish memory dimm health not available.
SENSOR_READING is not part of this rule; only the full label set is printed.
LABELS = {{ $labels }}
- alert: RedfishSmartStorageHealthNotOk
expr: redfish_smart_storage_health == 0
for: 0m
Expand Down
129 changes: 129 additions & 0 deletions tests/unit/test_alert_rules/test_redfish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,27 @@ tests:
SENSOR_READING = 82%
LABELS = map[__name__:redfish_sensor_info health:Unhealthy instance:ubuntu-2 reading:82%]
# Test case: a redfish_sensor_info series with health="N/A" must raise
# RedfishSensorHealthNotAvailable with severity warning at eval_time 0m.
# The reading label is also "N/A", so the rendered SENSOR_READING line is "N/A".
- interval: 1m
input_series:
- series: redfish_sensor_info{instance="ubuntu-2", health="N/A", reading="N/A"}
values: "1x15"

alert_rule_test:
- eval_time: 0m
alertname: RedfishSensorHealthNotAvailable
exp_alerts:
- exp_labels:
severity: warning
instance: ubuntu-2
health: N/A
reading: N/A
exp_annotations:
summary: Redfish sensor health not available. (instance ubuntu-2)
description: |
Redfish sensor health not available.
SENSOR_READING = N/A
LABELS = map[__name__:redfish_sensor_info health:N/A instance:ubuntu-2 reading:N/A]
- interval: 1m
input_series:
- series: redfish_processor_info{instance="ubuntu-1", health="Unhealthy", system_id="s1", processor_id="p1", model="processor-model-1"}
Expand All @@ -85,6 +106,28 @@ tests:
Redfish processor health not OK.
LABELS = map[__name__:redfish_processor_info health:Unhealthy instance:ubuntu-1 model:processor-model-1 processor_id:p1 system_id:s1]
# Test case: a redfish_processor_info series with health="NA" must raise
# RedfishProcessorHealthNotAvailable with severity warning at eval_time 0m,
# carrying every input label through to the alert.
- interval: 1m
input_series:
- series: redfish_processor_info{instance="ubuntu-1", health="NA", system_id="s1", processor_id="p1", model="processor-model-1"}
values: "1x15"

alert_rule_test:
- eval_time: 0m
alertname: RedfishProcessorHealthNotAvailable
exp_alerts:
- exp_labels:
severity: warning
instance: ubuntu-1
health: NA
system_id: s1
processor_id: p1
model: processor-model-1
exp_annotations:
summary: Redfish processor health not available. (instance ubuntu-1)
description: |
Redfish processor health not available.
LABELS = map[__name__:redfish_processor_info health:NA instance:ubuntu-1 model:processor-model-1 processor_id:p1 system_id:s1]
- interval: 1m
input_series:
- series: redfish_storage_controller_info{instance="ubuntu-1", health="Unhealthy", system_id="s1", storage_id="stor1", controller_id="ctrl1"}
Expand All @@ -107,6 +150,28 @@ tests:
Redfish storage controller health not OK.
LABELS = map[__name__:redfish_storage_controller_info controller_id:ctrl1 health:Unhealthy instance:ubuntu-1 storage_id:stor1 system_id:s1]
# Test case: a redfish_storage_controller_info series with health="NA" must
# raise RedfishStorageControllerHealthNotAvailable with severity warning at
# eval_time 0m, carrying every input label through to the alert.
- interval: 1m
input_series:
- series: redfish_storage_controller_info{instance="ubuntu-1", health="NA", system_id="s1", storage_id="stor1", controller_id="ctrl1"}
values: "1x15"

alert_rule_test:
- eval_time: 0m
alertname: RedfishStorageControllerHealthNotAvailable
exp_alerts:
- exp_labels:
severity: warning
instance: ubuntu-1
health: NA
system_id: s1
storage_id: stor1
controller_id: ctrl1
exp_annotations:
summary: Redfish storage controller health not available. (instance ubuntu-1)
description: |
Redfish storage controller health not available.
LABELS = map[__name__:redfish_storage_controller_info controller_id:ctrl1 health:NA instance:ubuntu-1 storage_id:stor1 system_id:s1]
- interval: 1m
input_series:
- series: redfish_chassis_info{instance="ubuntu-1", health="Unhealthy", chassis_id="ch1", model="chassis-model1"}
Expand All @@ -128,6 +193,27 @@ tests:
Redfish chassis health not OK.
LABELS = map[__name__:redfish_chassis_info chassis_id:ch1 health:Unhealthy instance:ubuntu-1 model:chassis-model1]
# Test case: a redfish_chassis_info series with health="NA" must raise
# RedfishChassisHealthNotAvailable with severity warning at eval_time 0m,
# carrying every input label through to the alert.
- interval: 1m
input_series:
- series: redfish_chassis_info{instance="ubuntu-1", health="NA", chassis_id="ch1", model="chassis-model1"}
values: "1x15"

alert_rule_test:
- eval_time: 0m
alertname: RedfishChassisHealthNotAvailable
exp_alerts:
- exp_labels:
severity: warning
instance: ubuntu-1
health: NA
chassis_id: ch1
model: chassis-model1
exp_annotations:
summary: Redfish chassis health not available. (instance ubuntu-1)
description: |
Redfish chassis health not available.
LABELS = map[__name__:redfish_chassis_info chassis_id:ch1 health:NA instance:ubuntu-1 model:chassis-model1]
- interval: 1m
input_series:
- series: redfish_storage_drive_info{instance="ubuntu-1", health="Unhealthy", system_id="s1", storage_id="stor1", drive_id="dr1"}
Expand All @@ -150,6 +236,28 @@ tests:
Redfish storage drive health not OK.
LABELS = map[__name__:redfish_storage_drive_info drive_id:dr1 health:Unhealthy instance:ubuntu-1 storage_id:stor1 system_id:s1]
# Test case: a redfish_storage_drive_info series with health="NA" must raise
# RedfishStorageDriveHealthNotAvailable with severity warning at eval_time 0m,
# carrying every input label through to the alert.
- interval: 1m
input_series:
- series: redfish_storage_drive_info{instance="ubuntu-1", health="NA", system_id="s1", storage_id="stor1", drive_id="dr1"}
values: "1x15"

alert_rule_test:
- eval_time: 0m
alertname: RedfishStorageDriveHealthNotAvailable
exp_alerts:
- exp_labels:
severity: warning
instance: ubuntu-1
health: NA
system_id: s1
storage_id: stor1
drive_id: dr1
exp_annotations:
summary: Redfish storage drive health not available. (instance ubuntu-1)
description: |
Redfish storage drive health not available.
LABELS = map[__name__:redfish_storage_drive_info drive_id:dr1 health:NA instance:ubuntu-1 storage_id:stor1 system_id:s1]
- interval: 1m
input_series:
- series: redfish_memory_dimm_info{instance="ubuntu-1", health="Unhealthy", system_id="s1", memory_id="mem1"}
Expand All @@ -171,6 +279,27 @@ tests:
Redfish memory dimm health not OK.
LABELS = map[__name__:redfish_memory_dimm_info health:Unhealthy instance:ubuntu-1 memory_id:mem1 system_id:s1]
# Test case: a redfish_memory_dimm_info series with health="NA" must raise
# RedfishMemoryDimmHealthNotAvailable with severity warning at eval_time 0m,
# carrying every input label through to the alert.
- interval: 1m
input_series:
- series: redfish_memory_dimm_info{instance="ubuntu-1", health="NA", system_id="s1", memory_id="mem1"}
values: "1x15"

alert_rule_test:
- eval_time: 0m
alertname: RedfishMemoryDimmHealthNotAvailable
exp_alerts:
- exp_labels:
severity: warning
instance: ubuntu-1
health: NA
system_id: s1
memory_id: mem1
exp_annotations:
summary: Redfish memory dimm health not available. (instance ubuntu-1)
description: |
Redfish memory dimm health not available.
LABELS = map[__name__:redfish_memory_dimm_info health:NA instance:ubuntu-1 memory_id:mem1 system_id:s1]
- interval: 1m
input_series:
- series: redfish_smart_storage_health{instance="ubuntu-1"}
Expand Down

0 comments on commit b00b228

Please sign in to comment.