Skip to content

Commit

Permalink
Azure Update Management monitoring with Fame - missing updates
Browse files Browse the repository at this point in the history
  • Loading branch information
BzSpi committed Apr 14, 2023
1 parent ed85085 commit db650ab
Show file tree
Hide file tree
Showing 13 changed files with 252 additions and 28 deletions.
6 changes: 4 additions & 2 deletions docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,8 @@

|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Azure Automation Update failed updates|X|-|-|-|-|
|Azure Automation Update failed updates|-|X|-|-|-|
|Azure Automation Update missing updates|-|X|-|-|-|


## fame_azure-storage-file-backup
Expand All @@ -137,7 +138,8 @@

|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Azure Update Center failed updates|X|-|-|-|-|
|Azure Update Center failed updates|-|X|-|-|-|
|Azure Update Center missing updates|-|X|-|-|-|


## fame_azure-vm-backup
Expand Down
4 changes: 3 additions & 1 deletion modules/fame_azure-automation-updates/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ This module creates the following SignalFx detectors which could contain one or

|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Azure Automation Update failed updates|X|-|-|-|-|
|Azure Automation Update failed updates|-|X|-|-|-|
|Azure Automation Update missing updates|-|X|-|-|-|

## How to collect required metrics?

Expand All @@ -95,6 +96,7 @@ Check the [Related documentation](#related-documentation) section for more detai

Here is the list of required metrics for detectors in this module.

* `fame.azure.automation_update.missing_updates`
* `fame.azure.automation_update.updates_status`


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ signals:
filter: filter('status', 'failed')

rules:
critical:
major:
threshold: 0
comparator: ">"
lasting_duration: '2h'
lasting_duration: '6h'
16 changes: 16 additions & 0 deletions modules/fame_azure-automation-updates/conf/02-missing-updates.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Generator input describing the "Azure Automation Update missing updates" detector.
# NOTE(review): nesting indentation is not visible in this view; keys below
# `signals:` and `rules:` are presumably nested under them — confirm in the repo.
module: Azure Automation Update
name: missing updates
# presumably enable generation of the *_transformation_function and
# *_aggregation_function variables for this detector — verify against the generator
transformation: true
aggregation: true

# Matches the default of the generated missing_updates_max_delay variable.
max_delay: 900
signals:
signal:
metric: fame.azure.automation_update.missing_updates
# Only the 'security' and 'critical' classification values are kept.
filter: filter('classification', 'security', 'critical')

rules:
# Single severity: alert as Major when the signal stays above 0 for the
# whole lasting duration.
major:
threshold: 0
comparator: ">"
lasting_duration: '35d'
37 changes: 32 additions & 5 deletions modules/fame_azure-automation-updates/detectors-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@ resource "signalfx_detector" "failed_updates" {

program_text = <<-EOF
signal = data('fame.azure.automation_update.updates_status', filter=filter('status', 'failed') and ${module.filtering.signalflow})${var.failed_updates_aggregation_function}${var.failed_updates_transformation_function}.publish('signal')
detect(when(signal > ${var.failed_updates_threshold_critical}, lasting=%{if var.failed_updates_lasting_duration_critical == null}None%{else}'${var.failed_updates_lasting_duration_critical}'%{endif}, at_least=${var.failed_updates_at_least_percentage_critical})).publish('CRIT')
detect(when(signal > ${var.failed_updates_threshold_major}, lasting=%{if var.failed_updates_lasting_duration_major == null}None%{else}'${var.failed_updates_lasting_duration_major}'%{endif}, at_least=${var.failed_updates_at_least_percentage_major})).publish('MAJOR')
EOF

rule {
description = "is too high > ${var.failed_updates_threshold_critical}"
severity = "Critical"
detect_label = "CRIT"
description = "is too high > ${var.failed_updates_threshold_major}"
severity = "Major"
detect_label = "MAJOR"
disabled = coalesce(var.failed_updates_disabled, var.detectors_disabled)
notifications = try(coalescelist(lookup(var.failed_updates_notifications, "critical", []), var.notifications.critical), null)
notifications = try(coalescelist(lookup(var.failed_updates_notifications, "major", []), var.notifications.major), null)
runbook_url = try(coalesce(var.failed_updates_runbook_url, var.runbook_url), "")
tip = var.failed_updates_tip
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
Expand All @@ -25,3 +25,30 @@ EOF
max_delay = var.failed_updates_max_delay
}

# Detector for the fame.azure.automation_update.missing_updates metric,
# restricted to the 'security' and 'critical' classifications. It publishes a
# single MAJOR rule driven by the missing_updates_* variables.
resource "signalfx_detector" "missing_updates" {
  name      = "${local.detector_name_prefix} Azure Automation Update missing updates"
  max_delay = var.missing_updates_max_delay

  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  authorized_writer_teams = var.authorized_writer_teams

  # The aggregation and transformation snippets are appended verbatim to the
  # data() call; a null lasting duration renders as SignalFlow `None`.
  program_text = <<-EOF
    signal = data('fame.azure.automation_update.missing_updates', filter=filter('classification', 'security', 'critical') and ${module.filtering.signalflow})${var.missing_updates_aggregation_function}${var.missing_updates_transformation_function}.publish('signal')
    detect(when(signal > ${var.missing_updates_threshold_major}, lasting=%{if var.missing_updates_lasting_duration_major == null}None%{else}'${var.missing_updates_lasting_duration_major}'%{endif}, at_least=${var.missing_updates_at_least_percentage_major})).publish('MAJOR')
  EOF

  rule {
    detect_label = "MAJOR"
    severity     = "Major"
    description  = "is too high > ${var.missing_updates_threshold_major}"

    # Detector-level settings win; module-wide values are the fallback.
    disabled      = coalesce(var.missing_updates_disabled, var.detectors_disabled)
    notifications = try(coalescelist(lookup(var.missing_updates_notifications, "major", []), var.notifications.major), null)
    runbook_url   = try(coalesce(var.missing_updates_runbook_url, var.runbook_url), "")
    tip           = var.missing_updates_tip

    # A custom message subject/body replaces the module's local defaults when set.
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }
}

5 changes: 5 additions & 0 deletions modules/fame_azure-automation-updates/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,8 @@ output "failed_updates" {
value = signalfx_detector.failed_updates
}

# Exposes the full missing_updates detector resource to callers of this module.
output "missing_updates" {
  value       = signalfx_detector.missing_updates
  description = "Detector resource for missing_updates"
}

71 changes: 66 additions & 5 deletions modules/fame_azure-automation-updates/variables-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,80 @@ variable "failed_updates_disabled" {
default = null
}

variable "failed_updates_threshold_critical" {
description = "Critical threshold for failed_updates detector"
variable "failed_updates_threshold_major" {
description = "Major threshold for failed_updates detector"
type = number
default = 0
}

variable "failed_updates_lasting_duration_critical" {
variable "failed_updates_lasting_duration_major" {
description = "Minimum duration that conditions must be true before raising alert"
type = string
default = "2h"
default = "6h"
}

variable "failed_updates_at_least_percentage_critical" {
variable "failed_updates_at_least_percentage_major" {
description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
type = number
default = 1
}
# missing_updates detector

# Per-severity notification override for the missing_updates detector. The
# detector rule looks up the "major" key and falls back to
# var.notifications.major when this map has no (non-empty) entry for it.
variable "missing_updates_notifications" {
  type        = map(list(string))
  default     = {}
  description = "Notification recipients list per severity overridden for missing_updates detector"
}

# SignalFlow snippet appended verbatim right after the data() call of the
# missing_updates detector; the empty default adds nothing.
variable "missing_updates_aggregation_function" {
  type        = string
  default     = ""
  description = "Aggregation function and group by for missing_updates detector (i.e. \".mean(by=['host'])\")"
}

# SignalFlow snippet appended verbatim after the aggregation function in the
# missing_updates detector program; the empty default adds nothing.
variable "missing_updates_transformation_function" {
  type        = string
  default     = ""
  description = "Transformation function for missing_updates detector (i.e. \".mean(over='5m')\")"
}

# Value assigned to the max_delay argument of the missing_updates detector.
# NOTE(review): unit is presumably seconds — confirm against the provider docs.
variable "missing_updates_max_delay" {
  type        = number
  default     = 900
  description = "Enforce max delay for missing_updates detector (use \"0\" or \"null\" for \"Auto\")"
}

# Free-text hint passed unchanged to the `tip` field of the missing_updates rule.
variable "missing_updates_tip" {
  type        = string
  default     = ""
  description = "Suggested first course of action or any note useful for incident handling"
}

# Detector-specific runbook link; the rule combines it with the module-wide
# var.runbook_url through coalesce(), ending at "" when neither is usable.
variable "missing_updates_runbook_url" {
  type        = string
  default     = ""
  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
}

# Detector-specific on/off switch; the null default lets the rule fall back to
# var.detectors_disabled through coalesce().
variable "missing_updates_disabled" {
  type        = bool
  default     = null
  description = "Disable all alerting rules for missing_updates detector"
}

# Value the signal is compared against with `>` in the MAJOR detect() condition;
# it is also echoed in the rule description.
variable "missing_updates_threshold_major" {
  type        = number
  default     = 0
  description = "Major threshold for missing_updates detector"
}

# Rendered as the `lasting=` argument of the MAJOR detect() condition; a null
# value is rendered as SignalFlow `None` instead of a quoted duration.
variable "missing_updates_lasting_duration_major" {
  type        = string
  default     = "35d"
  description = "Minimum duration that conditions must be true before raising alert"
}

variable "missing_updates_at_least_percentage_major" {
description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
type = number
default = 1
Expand Down
4 changes: 3 additions & 1 deletion modules/fame_azure-update-center/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ This module creates the following SignalFx detectors which could contain one or

|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Azure Update Center failed updates|X|-|-|-|-|
|Azure Update Center failed updates|-|X|-|-|-|
|Azure Update Center missing updates|-|X|-|-|-|

## How to collect required metrics?

Expand All @@ -95,6 +96,7 @@ Check the [Related documentation](#related-documentation) section for more detai

Here is the list of required metrics for detectors in this module.

* `fame.azure.update_center.missing_updates`
* `fame.azure.update_center.updates_status`


Expand Down
4 changes: 2 additions & 2 deletions modules/fame_azure-update-center/conf/01-failed-updates.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ signals:
filter: filter('status', 'failed')

rules:
critical:
major:
threshold: 0
comparator: ">"
lasting_duration: '2h'
lasting_duration: '6h'
16 changes: 16 additions & 0 deletions modules/fame_azure-update-center/conf/02-missing-updates.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Generator input describing the "Azure Update Center missing updates" detector.
# NOTE(review): nesting indentation is not visible in this view; keys below
# `signals:` and `rules:` are presumably nested under them — confirm in the repo.
module: Azure Update Center
name: missing updates
# presumably enable generation of the *_transformation_function and
# *_aggregation_function variables for this detector — verify against the generator
transformation: true
aggregation: true

# presumably becomes the default of the generated max_delay variable — confirm
max_delay: 900
signals:
signal:
metric: fame.azure.update_center.missing_updates
# Only the 'security' and 'critical' classification values are kept.
filter: filter('classification', 'security', 'critical')

rules:
# Single severity: alert as Major when the signal stays above 0 for the
# whole lasting duration.
major:
threshold: 0
comparator: ">"
lasting_duration: '35d'
37 changes: 32 additions & 5 deletions modules/fame_azure-update-center/detectors-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@ resource "signalfx_detector" "failed_updates" {

program_text = <<-EOF
signal = data('fame.azure.update_center.updates_status', filter=filter('status', 'failed') and ${module.filtering.signalflow})${var.failed_updates_aggregation_function}${var.failed_updates_transformation_function}.publish('signal')
detect(when(signal > ${var.failed_updates_threshold_critical}, lasting=%{if var.failed_updates_lasting_duration_critical == null}None%{else}'${var.failed_updates_lasting_duration_critical}'%{endif}, at_least=${var.failed_updates_at_least_percentage_critical})).publish('CRIT')
detect(when(signal > ${var.failed_updates_threshold_major}, lasting=%{if var.failed_updates_lasting_duration_major == null}None%{else}'${var.failed_updates_lasting_duration_major}'%{endif}, at_least=${var.failed_updates_at_least_percentage_major})).publish('MAJOR')
EOF

rule {
description = "is too high > ${var.failed_updates_threshold_critical}"
severity = "Critical"
detect_label = "CRIT"
description = "is too high > ${var.failed_updates_threshold_major}"
severity = "Major"
detect_label = "MAJOR"
disabled = coalesce(var.failed_updates_disabled, var.detectors_disabled)
notifications = try(coalescelist(lookup(var.failed_updates_notifications, "critical", []), var.notifications.critical), null)
notifications = try(coalescelist(lookup(var.failed_updates_notifications, "major", []), var.notifications.major), null)
runbook_url = try(coalesce(var.failed_updates_runbook_url, var.runbook_url), "")
tip = var.failed_updates_tip
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
Expand All @@ -25,3 +25,30 @@ EOF
max_delay = var.failed_updates_max_delay
}

# Detector for the fame.azure.update_center.missing_updates metric, restricted
# to the 'security' and 'critical' classifications. It publishes a single MAJOR
# rule driven by the missing_updates_* variables.
resource "signalfx_detector" "missing_updates" {
  name      = "${local.detector_name_prefix} Azure Update Center missing updates"
  max_delay = var.missing_updates_max_delay

  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  authorized_writer_teams = var.authorized_writer_teams

  # The aggregation and transformation snippets are appended verbatim to the
  # data() call; a null lasting duration renders as SignalFlow `None`.
  program_text = <<-EOF
    signal = data('fame.azure.update_center.missing_updates', filter=filter('classification', 'security', 'critical') and ${module.filtering.signalflow})${var.missing_updates_aggregation_function}${var.missing_updates_transformation_function}.publish('signal')
    detect(when(signal > ${var.missing_updates_threshold_major}, lasting=%{if var.missing_updates_lasting_duration_major == null}None%{else}'${var.missing_updates_lasting_duration_major}'%{endif}, at_least=${var.missing_updates_at_least_percentage_major})).publish('MAJOR')
  EOF

  rule {
    detect_label = "MAJOR"
    severity     = "Major"
    description  = "is too high > ${var.missing_updates_threshold_major}"

    # Detector-level settings win; module-wide values are the fallback.
    disabled      = coalesce(var.missing_updates_disabled, var.detectors_disabled)
    notifications = try(coalescelist(lookup(var.missing_updates_notifications, "major", []), var.notifications.major), null)
    runbook_url   = try(coalesce(var.missing_updates_runbook_url, var.runbook_url), "")
    tip           = var.missing_updates_tip

    # A custom message subject/body replaces the module's local defaults when set.
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }
}

5 changes: 5 additions & 0 deletions modules/fame_azure-update-center/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,8 @@ output "failed_updates" {
value = signalfx_detector.failed_updates
}

# Exposes the full missing_updates detector resource to callers of this module.
output "missing_updates" {
  value       = signalfx_detector.missing_updates
  description = "Detector resource for missing_updates"
}

Loading

0 comments on commit db650ab

Please sign in to comment.