diff --git a/jobs/bosh_alerts/spec b/jobs/bosh_alerts/spec index 5ad3d1aa..a019ddfd 100644 --- a/jobs/bosh_alerts/spec +++ b/jobs/bosh_alerts/spec @@ -117,3 +117,6 @@ properties: bosh_alerts.tsdb_message_too_old.evaluation_time: description: "TSDB Message too old alert evaluation time" default: 5m + bosh_alerts.disable_values_in_alert_msg: + description: "Remove values from alert messages" + default: false diff --git a/jobs/bosh_alerts/templates/bosh_system.alerts.yml b/jobs/bosh_alerts/templates/bosh_system.alerts.yml index 2dbbb8e5..016a290b 100644 --- a/jobs/bosh_alerts/templates/bosh_system.alerts.yml +++ b/jobs/bosh_alerts/templates/bosh_system.alerts.yml @@ -9,7 +9,7 @@ groups: severity: warning annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is reporting a high CPU load average" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` had a CPU load average (1m) above <%= p('bosh_alerts.job_high_cpu_load.threshold') %> for <%= p('bosh_alerts.job_high_cpu_load.evaluation_time') %>: {{$value}}" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` had a CPU load average (1m) above <%= p('bosh_alerts.job_high_cpu_load.threshold') %> for <%= p('bosh_alerts.job_high_cpu_load.evaluation_time') %> <% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}' %> <% end %> <% end %>" - alert: BOSHJobLowFreeRAM expr: avg(bosh_job_mem_percent{bosh_job_name!~"^compilation.*",bosh_deployment!="bosh-health-check"}) by(environment, bosh_name, bosh_deployment, bosh_job_name, bosh_job_index) > <%= p('bosh_alerts.job_low_free_ram.threshold') %> @@ -19,7 +19,7 @@ groups: severity: warning annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is reporting low free RAM" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_low_free_ram.threshold') %>% of its RAM for <%= p('bosh_alerts.job_low_free_ram.evaluation_time') %>: {{$value}}%" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_low_free_ram.threshold') %>% of its RAM for <%= p('bosh_alerts.job_low_free_ram.evaluation_time') %><% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}%' %> <% end %> <% end %>" - alert: BOSHJobLowSwap expr: avg(bosh_job_swap_percent{bosh_job_name!~"^compilation.*",bosh_deployment!="bosh-health-check"}) by(environment, bosh_name, bosh_deployment, bosh_job_name, bosh_job_index) > <%= p('bosh_alerts.job_low_swap.threshold') %> @@ -29,7 +29,7 @@ groups: severity: warning annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is reporting low swap" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_low_swap.threshold') %>% of its swap for <%= p('bosh_alerts.job_low_swap.evaluation_time') %>: {{$value}}%" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_low_swap.threshold') %>% of its swap for <%= p('bosh_alerts.job_low_swap.evaluation_time') %><% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}%' %> <% end %> <% end %>" - alert: BOSHJobSystemDiskFull expr: avg(bosh_job_system_disk_percent{bosh_job_name!~"^compilation.*",bosh_deployment!="bosh-health-check"}) by(environment, bosh_name, bosh_deployment, bosh_job_name, bosh_job_index) > <%= p('bosh_alerts.job_system_disk_full.threshold') %> @@ -39,7 +39,7 @@ groups: severity: critical annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is running out of system disk" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_system_disk_full.threshold') %>% of its system disk for <%= p('bosh_alerts.job_system_disk_full.evaluation_time') %>: {{$value}}%" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_system_disk_full.threshold') %>% of its system disk for <%= p('bosh_alerts.job_system_disk_full.evaluation_time') %><% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}%' %> <% end %> <% end %>" - alert: BOSHJobEphemeralDiskFull expr: avg(bosh_job_ephemeral_disk_percent{bosh_job_name!~"^compilation.*",bosh_deployment!="bosh-health-check"}) by(environment, bosh_name, bosh_deployment, bosh_job_name, bosh_job_index) > <%= p('bosh_alerts.job_ephemeral_disk_full.threshold') %> @@ -49,7 +49,7 @@ groups: severity: critical annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is running out of ephemeral disk" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_ephemeral_disk_full.threshold') %>% of its ephemeral disk for <%= p('bosh_alerts.job_ephemeral_disk_full.evaluation_time') %>s: {{$value}}%" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_ephemeral_disk_full.threshold') %>% of its ephemeral disk for <%= p('bosh_alerts.job_ephemeral_disk_full.evaluation_time') %>s<% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}%' %> <% end %> <% end %>" - alert: BOSHJobPersistentDiskFull expr: avg(bosh_job_persistent_disk_percent{bosh_job_name!~"^compilation.*",bosh_deployment!="bosh-health-check"}) by(environment, bosh_name, bosh_deployment, bosh_job_name, bosh_job_index) > <%= p('bosh_alerts.job_persistent_disk_full.threshold') %> @@ -59,7 +59,7 @@ groups: severity: critical annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is running out of persistent disk" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_persistent_disk_full.threshold') %>% of its persistent disk for <%= p('bosh_alerts.job_persistent_disk_full.evaluation_time') %>: {{$value}}%" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_persistent_disk_full.threshold') %>% of its persistent disk for <%= p('bosh_alerts.job_persistent_disk_full.evaluation_time') %><% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}%' %> <% end %> <% end %>" - alert: BOSHJobPersistentDiskInodesExhausted expr: avg(bosh_job_persistent_disk_inode_percent{bosh_job_name!~"^compilation.*",bosh_deployment!="bosh-health-check"}) by(environment, bosh_name, bosh_deployment, bosh_job_name, bosh_job_index) > <%= p('bosh_alerts.job_persistent_disk_inodes_exhausted.threshold') %> @@ -69,4 +69,4 @@ groups: severity: critical annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is running out of inodes" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_persistent_disk_inodes_exhausted.threshold') %>% of its persistent disk inodes for <%= p('bosh_alerts.job_persistent_disk_inodes_exhausted.evaluation_time') %>: {{$value}}%" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_name}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_persistent_disk_inodes_exhausted.threshold') %>% of its persistent disk inodes for <%= p('bosh_alerts.job_persistent_disk_inodes_exhausted.evaluation_time') %><% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}%' %> <% end %> <% end %>" diff --git a/jobs/bosh_alerts/templates/bosh_tsdb_system.alerts.yml b/jobs/bosh_alerts/templates/bosh_tsdb_system.alerts.yml index 8b5c0b71..f0490aed 100644 --- a/jobs/bosh_alerts/templates/bosh_tsdb_system.alerts.yml +++ b/jobs/bosh_alerts/templates/bosh_tsdb_system.alerts.yml @@ -9,7 +9,7 @@ groups: severity: warning annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is reporting a high CPU load average" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` had a CPU load average (1m) above <%= p('bosh_alerts.job_high_cpu_load.threshold') %> for <%= p('bosh_alerts.job_high_cpu_load.evaluation_time') %>: {{$value}}" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` had a CPU load average (1m) above <%= p('bosh_alerts.job_high_cpu_load.threshold') %> for <%= p('bosh_alerts.job_high_cpu_load.evaluation_time') %><% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}' %> <% end %> <% end %>" - alert: BOSHTSDBJobLowFreeRAM expr: avg(bosh_tsdb_job_mem_percent{bosh_job_name!~"^compilation.*",bosh_deployment!="bosh-health-check"}) by(environment, bosh_deployment, bosh_job_name, bosh_job_index) > <%= p('bosh_alerts.job_low_free_ram.threshold') %> @@ -19,7 +19,7 @@ groups: severity: warning annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is reporting low free RAM" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_low_free_ram.threshold') %>% of its RAM for <%= p('bosh_alerts.job_low_free_ram.evaluation_time') %>: {{$value}}%" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_low_free_ram.threshold') %>% of its RAM for <%= p('bosh_alerts.job_low_free_ram.evaluation_time') %><% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}%' %> <% end %> <% end %>" - alert: BOSHTSDBJobLowSwap expr: avg(bosh_tsdb_job_swap_percent{bosh_job_name!~"^compilation.*",bosh_deployment!="bosh-health-check"}) by(environment, bosh_deployment, bosh_job_name, bosh_job_index) > <%= p('bosh_alerts.job_low_swap.threshold') %> @@ -29,7 +29,7 @@ groups: severity: warning annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is reporting low swap" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_low_swap.threshold') %>% of its swap for <%= p('bosh_alerts.job_low_swap.evaluation_time') %>: {{$value}}%" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_low_swap.threshold') %>% of its swap for <%= p('bosh_alerts.job_low_swap.evaluation_time') %><% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}%' %> <% end %> <% end %>" - alert: BOSHTSDBJobSystemDiskFull expr: avg(bosh_tsdb_job_system_disk_percent{bosh_job_name!~"^compilation.*",bosh_deployment!="bosh-health-check"}) by(environment, bosh_deployment, bosh_job_name, bosh_job_index) > <%= p('bosh_alerts.job_system_disk_full.threshold') %> @@ -39,7 +39,7 @@ groups: severity: critical annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is running out of system disk" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_system_disk_full.threshold') %>% of its system disk for <%= p('bosh_alerts.job_system_disk_full.evaluation_time') %>: {{$value}}%" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_system_disk_full.threshold') %>% of its system disk for <%= p('bosh_alerts.job_system_disk_full.evaluation_time') %><% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}%' %> <% end %> <% end %>" - alert: BOSHTSDBJobEphemeralDiskFull expr: avg(bosh_tsdb_job_ephemeral_disk_percent{bosh_job_name!~"^compilation.*",bosh_deployment!="bosh-health-check"}) by(environment, bosh_deployment, bosh_job_name, bosh_job_index) > <%= p('bosh_alerts.job_ephemeral_disk_full.threshold') %> @@ -49,7 +49,7 @@ groups: severity: critical annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is running out of ephemeral disk" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_ephemeral_disk_full.threshold') %>% of its ephemeral disk for <%= p('bosh_alerts.job_ephemeral_disk_full.evaluation_time') %>: {{$value}}%" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_ephemeral_disk_full.threshold') %>% of its ephemeral disk for <%= p('bosh_alerts.job_ephemeral_disk_full.evaluation_time') %><% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}%' %> <% end %> <% end %>" - alert: BOSHTSDBJobPersistentDiskFull expr: avg(bosh_tsdb_job_persistent_disk_percent{bosh_job_name!~"^compilation.*",bosh_deployment!="bosh-health-check"}) by(environment, bosh_deployment, bosh_job_name, bosh_job_index) > <%= p('bosh_alerts.job_persistent_disk_full.threshold') %> @@ -59,4 +59,4 @@ groups: severity: critical annotations: summary: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` is running out of persistent disk" - description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_persistent_disk_full.threshold') %>% of its persistent disk for <%= p('bosh_alerts.job_persistent_disk_full.evaluation_time') %>: {{$value}}%" + description: "BOSH Job `{{$labels.environment}}/{{$labels.bosh_deployment}}/{{$labels.bosh_job_name}}/{{$labels.bosh_job_index}}` has used more than <%= p('bosh_alerts.job_persistent_disk_full.threshold') %>% of its persistent disk for <%= p('bosh_alerts.job_persistent_disk_full.evaluation_time') %><% if_p('bosh_alerts.disable_values_in_alert_msg') do | disabled | %> <% if !(disabled) %> <%= ': {{$value}}%' %> <% end %> <% end %>"