From 43c2fbf57299b12968093aaf679f1fccc552d9cd Mon Sep 17 00:00:00 2001 From: Benjamin DUPUIS Date: Mon, 25 Sep 2023 09:46:33 +0200 Subject: [PATCH] Add Docker State Exporter --- docs/severity.md | 11 + .../README.md | 118 +++++++++ .../common-filters.tf | 1 + .../common-locals.tf | 1 + .../common-modules.tf | 1 + .../common-variables.tf | 1 + .../common-versions.tf | 1 + .../conf/00-heartbeat.yaml | 13 + .../conf/01-state-health-status.yaml | 15 ++ .../conf/02-state-status.yaml | 15 ++ .../conf/03-state-oomkilled.yaml | 15 ++ .../conf/readme.yaml | 3 + .../detectors-gen.tf | 112 +++++++++ .../outputs.tf | 20 ++ .../prometheus-exporter_docker-state/tags.tf | 4 + .../variables-gen.tf | 227 ++++++++++++++++++ 16 files changed, 558 insertions(+) create mode 100644 modules/prometheus-exporter_docker-state/README.md create mode 120000 modules/prometheus-exporter_docker-state/common-filters.tf create mode 120000 modules/prometheus-exporter_docker-state/common-locals.tf create mode 120000 modules/prometheus-exporter_docker-state/common-modules.tf create mode 120000 modules/prometheus-exporter_docker-state/common-variables.tf create mode 120000 modules/prometheus-exporter_docker-state/common-versions.tf create mode 100644 modules/prometheus-exporter_docker-state/conf/00-heartbeat.yaml create mode 100644 modules/prometheus-exporter_docker-state/conf/01-state-health-status.yaml create mode 100644 modules/prometheus-exporter_docker-state/conf/02-state-status.yaml create mode 100644 modules/prometheus-exporter_docker-state/conf/03-state-oomkilled.yaml create mode 100644 modules/prometheus-exporter_docker-state/conf/readme.yaml create mode 100644 modules/prometheus-exporter_docker-state/detectors-gen.tf create mode 100644 modules/prometheus-exporter_docker-state/outputs.tf create mode 100644 modules/prometheus-exporter_docker-state/tags.tf create mode 100644 modules/prometheus-exporter_docker-state/variables-gen.tf diff --git a/docs/severity.md b/docs/severity.md index 
b35e2c92e..143a57ea0 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -77,6 +77,7 @@ - [organization_usage](#organization_usage) - [otel-collector_kubernetes-common](#otel-collector_kubernetes-common) - [prometheus-exporter_active-directory](#prometheus-exporter_active-directory) +- [prometheus-exporter_docker-state](#prometheus-exporter_docker-state) - [prometheus-exporter_kong](#prometheus-exporter_kong) - [prometheus-exporter_oracledb](#prometheus-exporter_oracledb) - [prometheus-exporter_squid](#prometheus-exporter_squid) @@ -851,6 +852,16 @@ |Active-directory active directory services|X|-|-|-|-| +## prometheus-exporter_docker-state + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Docker-state heartbeat|X|-|-|-|-| +|Docker-state state health status|X|-|-|-|-| +|Docker-state state status|X|-|-|-|-| +|Docker-state state oom killed|X|-|-|-|-| + + ## prometheus-exporter_kong |Detector|Critical|Major|Minor|Warning|Info| diff --git a/modules/prometheus-exporter_docker-state/README.md b/modules/prometheus-exporter_docker-state/README.md new file mode 100644 index 000000000..8c68e67fb --- /dev/null +++ b/modules/prometheus-exporter_docker-state/README.md @@ -0,0 +1,118 @@ +# DOCKER-STATE SignalFx detectors + + + +:link: **Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) +- [How to collect required metrics?](#how-to-collect-required-metrics) + - [Metrics](#metrics) +- [Related documentation](#related-documentation) + + + +## How to use this module? 
+ +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/language/modules/syntax) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to the URL of this folder: + +```hcl +module "signalfx-detectors-prometheus-exporter-docker-state" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/prometheus-exporter_docker-state?ref={revision}" + + environment = var.environment + notifications = local.notifications +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required. + Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/language/modules/sources)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purposes. Note that all modules in this repository are available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as source + instead of `git` which is more flexible but less future-proof. + +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. + Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). 
+ In general, it will also be used in the `filtering` internal sub-module to [apply + filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention). + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/language/expressions/type-constraints#object) where each key represents an available + [detector rule severity](https://docs.splunk.com/observability/alerts-detectors-notifications/create-detectors-for-alerts.html#severity) + and its value is a list of recipients. Every recipient must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). + Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables-gen.tf](variables-gen.tf). +In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/language/values/variables) make it possible to +customize the detectors behavior to better fit your needs. + +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. 
+ +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Docker-state heartbeat|X|-|-|-|-| +|Docker-state state health status|X|-|-|-|-| +|Docker-state state status|X|-|-|-|-| +|Docker-state state oom killed|X|-|-|-|-| + +## How to collect required metrics? + +This module deploys detectors using metrics reported by the +scraping of a server following the [OpenMetrics convention](https://openmetrics.io/) based on and compatible with [the Prometheus +exposition format](https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#openmetrics-text-format). + +They are generally called `Prometheus Exporters` which can be fetched by both the [SignalFx Smart Agent](https://github.com/signalfx/signalfx-agent) +thanks to its [prometheus exporter monitor](https://github.com/signalfx/signalfx-agent/blob/main/docs/monitors/prometheus-exporter.md) and the +[OpenTelemetry Collector](https://github.com/signalfx/splunk-otel-collector) using its [prometheus +receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) or its derivatives. + +These exporters could be embedded directly in the tool you want to monitor (e.g. nginx ingress) or must be installed next to it as +a separate program configured to connect, create metrics and expose them as a server. + + +Check the [Related documentation](#related-documentation) section for more detailed and specific information about this module's dependencies. + + + +### Metrics + + +Here is the list of required metrics for detectors in this module. 
+ +* `container_state_health_status` +* `container_state_oomkilled` +* `container_state_status` + + + + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) +* [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html) diff --git a/modules/prometheus-exporter_docker-state/common-filters.tf b/modules/prometheus-exporter_docker-state/common-filters.tf new file mode 120000 index 000000000..51ac61525 --- /dev/null +++ b/modules/prometheus-exporter_docker-state/common-filters.tf @@ -0,0 +1 @@ +../../common/module/filters-prometheus-exporter.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_docker-state/common-locals.tf b/modules/prometheus-exporter_docker-state/common-locals.tf new file mode 120000 index 000000000..5672d21ab --- /dev/null +++ b/modules/prometheus-exporter_docker-state/common-locals.tf @@ -0,0 +1 @@ +../../common/module/locals.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_docker-state/common-modules.tf b/modules/prometheus-exporter_docker-state/common-modules.tf new file mode 120000 index 000000000..8c81ef377 --- /dev/null +++ b/modules/prometheus-exporter_docker-state/common-modules.tf @@ -0,0 +1 @@ +../../common/module/modules.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_docker-state/common-variables.tf b/modules/prometheus-exporter_docker-state/common-variables.tf new file mode 120000 index 000000000..f3037a584 --- /dev/null +++ b/modules/prometheus-exporter_docker-state/common-variables.tf @@ -0,0 +1 @@ +../../common/module/variables.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_docker-state/common-versions.tf b/modules/prometheus-exporter_docker-state/common-versions.tf new file mode 
120000 index 000000000..fa7f5509f --- /dev/null +++ b/modules/prometheus-exporter_docker-state/common-versions.tf @@ -0,0 +1 @@ +../../common/module/versions.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_docker-state/conf/00-heartbeat.yaml b/modules/prometheus-exporter_docker-state/conf/00-heartbeat.yaml new file mode 100644 index 000000000..8fd760d2f --- /dev/null +++ b/modules/prometheus-exporter_docker-state/conf/00-heartbeat.yaml @@ -0,0 +1,13 @@ +## Example +module: docker-state +name: heartbeat + +transformation: false +aggregation: true +exclude_not_running_vm: true + +signals: + signal: + metric: "container_state_status" +rules: + critical: diff --git a/modules/prometheus-exporter_docker-state/conf/01-state-health-status.yaml b/modules/prometheus-exporter_docker-state/conf/01-state-health-status.yaml new file mode 100644 index 000000000..6dc3f6fd7 --- /dev/null +++ b/modules/prometheus-exporter_docker-state/conf/01-state-health-status.yaml @@ -0,0 +1,15 @@ +module: docker-state +name: State Health Status + +transformation: ".min(over='5m')" +aggregation: true + +filtering: "filter('service.name', 'docker-state-exporter') and filter('status', 'unhealthy')" + +signals: + signal: + metric: container_state_health_status +rules: + critical: + threshold: 0 + comparator: ">" diff --git a/modules/prometheus-exporter_docker-state/conf/02-state-status.yaml b/modules/prometheus-exporter_docker-state/conf/02-state-status.yaml new file mode 100644 index 000000000..d6e593407 --- /dev/null +++ b/modules/prometheus-exporter_docker-state/conf/02-state-status.yaml @@ -0,0 +1,15 @@ +module: docker-state +name: State Status + +transformation: ".min(over='5m')" +aggregation: true + +filtering: "filter('service.name', 'docker-state-exporter') and not filter('status', 'running')" + +signals: + signal: + metric: container_state_status +rules: + critical: + threshold: 0 + comparator: ">" diff --git 
a/modules/prometheus-exporter_docker-state/conf/03-state-oomkilled.yaml b/modules/prometheus-exporter_docker-state/conf/03-state-oomkilled.yaml new file mode 100644 index 000000000..b625aef97 --- /dev/null +++ b/modules/prometheus-exporter_docker-state/conf/03-state-oomkilled.yaml @@ -0,0 +1,15 @@ +module: docker-state +name: State OOM Killed + +transformation: ".min(over='5m')" +aggregation: true + +filtering: "filter('service.name', 'docker-state-exporter')" + +signals: + signal: + metric: container_state_oomkilled +rules: + critical: + threshold: 0 + comparator: ">" diff --git a/modules/prometheus-exporter_docker-state/conf/readme.yaml b/modules/prometheus-exporter_docker-state/conf/readme.yaml new file mode 100644 index 000000000..9015fc41a --- /dev/null +++ b/modules/prometheus-exporter_docker-state/conf/readme.yaml @@ -0,0 +1,3 @@ +documentations: + +source_doc: diff --git a/modules/prometheus-exporter_docker-state/detectors-gen.tf b/modules/prometheus-exporter_docker-state/detectors-gen.tf new file mode 100644 index 000000000..8ec231c4d --- /dev/null +++ b/modules/prometheus-exporter_docker-state/detectors-gen.tf @@ -0,0 +1,112 @@ +resource "signalfx_detector" "heartbeat" { + name = format("%s %s", local.detector_name_prefix, "Docker-state heartbeat") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + from signalfx.detectors.not_reporting import not_reporting + signal = data('container_state_status', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') +EOF + + rule { + description = "has not reported in 
${var.heartbeat_timeframe}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.heartbeat_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.heartbeat_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.heartbeat_runbook_url, var.runbook_url), "") + tip = var.heartbeat_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.heartbeat_max_delay +} + +resource "signalfx_detector" "state_health_status" { + name = format("%s %s", local.detector_name_prefix, "Docker-state state health status") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('service.name', 'docker-state-exporter') and filter('status', 'unhealthy') + signal = data('container_state_health_status', filter=base_filtering and ${module.filtering.signalflow})${var.state_health_status_aggregation_function}${var.state_health_status_transformation_function}.publish('signal') + detect(when(signal > ${var.state_health_status_threshold_critical}, lasting=%{if var.state_health_status_lasting_duration_critical == null}None%{else}'${var.state_health_status_lasting_duration_critical}'%{endif}, at_least=${var.state_health_status_at_least_percentage_critical})).publish('CRIT') +EOF + + rule { + description = "is too high > ${var.state_health_status_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.state_health_status_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.state_health_status_notifications, "critical", []), var.notifications.critical), null) + runbook_url = 
try(coalesce(var.state_health_status_runbook_url, var.runbook_url), "") + tip = var.state_health_status_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.state_health_status_max_delay +} + +resource "signalfx_detector" "state_status" { + name = format("%s %s", local.detector_name_prefix, "Docker-state state status") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('service.name', 'docker-state-exporter') and not filter('status', 'running') + signal = data('container_state_status', filter=base_filtering and ${module.filtering.signalflow})${var.state_status_aggregation_function}${var.state_status_transformation_function}.publish('signal') + detect(when(signal > ${var.state_status_threshold_critical}, lasting=%{if var.state_status_lasting_duration_critical == null}None%{else}'${var.state_status_lasting_duration_critical}'%{endif}, at_least=${var.state_status_at_least_percentage_critical})).publish('CRIT') +EOF + + rule { + description = "is too high > ${var.state_status_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.state_status_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.state_status_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.state_status_runbook_url, var.runbook_url), "") + tip = var.state_status_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.state_status_max_delay +} + +resource "signalfx_detector" "state_oom_killed" { + name = format("%s %s", local.detector_name_prefix, "Docker-state state oom killed") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('service.name', 'docker-state-exporter') + signal = data('container_state_oomkilled', filter=base_filtering and ${module.filtering.signalflow})${var.state_oom_killed_aggregation_function}${var.state_oom_killed_transformation_function}.publish('signal') + detect(when(signal > ${var.state_oom_killed_threshold_critical}, lasting=%{if var.state_oom_killed_lasting_duration_critical == null}None%{else}'${var.state_oom_killed_lasting_duration_critical}'%{endif}, at_least=${var.state_oom_killed_at_least_percentage_critical})).publish('CRIT') +EOF + + rule { + description = "is too high > ${var.state_oom_killed_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.state_oom_killed_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.state_oom_killed_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.state_oom_killed_runbook_url, var.runbook_url), "") + tip = var.state_oom_killed_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.state_oom_killed_max_delay +} + diff --git a/modules/prometheus-exporter_docker-state/outputs.tf b/modules/prometheus-exporter_docker-state/outputs.tf new file mode 100644 index 000000000..22376098e --- /dev/null +++ b/modules/prometheus-exporter_docker-state/outputs.tf @@ -0,0 +1,20 @@ +output "heartbeat" { + description = "Detector resource for heartbeat" + value = signalfx_detector.heartbeat +} + +output "state_health_status" { + description = "Detector resource for state_health_status" + value = signalfx_detector.state_health_status +} + +output "state_oom_killed" { + description = "Detector resource for state_oom_killed" + value = signalfx_detector.state_oom_killed +} + +output "state_status" { + description = "Detector resource for state_status" + value = signalfx_detector.state_status +} + diff --git a/modules/prometheus-exporter_docker-state/tags.tf b/modules/prometheus-exporter_docker-state/tags.tf new file mode 100644 index 000000000..84d30d073 --- /dev/null +++ b/modules/prometheus-exporter_docker-state/tags.tf @@ -0,0 +1,4 @@ +locals { + tags = ["prometheus-exporter", "docker-state"] +} + diff --git a/modules/prometheus-exporter_docker-state/variables-gen.tf b/modules/prometheus-exporter_docker-state/variables-gen.tf new file mode 100644 index 000000000..0778189d2 --- /dev/null +++ b/modules/prometheus-exporter_docker-state/variables-gen.tf @@ -0,0 +1,227 @@ +# heartbeat detector + +variable "heartbeat_notifications" { + description = "Notification recipients list per severity overridden for heartbeat detector" + type = map(list(string)) + default = {} +} + +variable "heartbeat_aggregation_function" { + description = "Aggregation function and group by for heartbeat detector (i.e. 
\".mean(by=['host'])\")" + type = string + default = "" +} + +variable "heartbeat_max_delay" { + description = "Enforce max delay for heartbeat detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = 900 +} + +variable "heartbeat_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "heartbeat_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "heartbeat_disabled" { + description = "Disable all alerting rules for heartbeat detector" + type = bool + default = null +} + +variable "heartbeat_timeframe" { + description = "Timeframe for heartbeat detector (i.e. \"10m\")" + type = string + default = "10m" +} + +# state_health_status detector + +variable "state_health_status_notifications" { + description = "Notification recipients list per severity overridden for state_health_status detector" + type = map(list(string)) + default = {} +} + +variable "state_health_status_aggregation_function" { + description = "Aggregation function and group by for state_health_status detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "state_health_status_transformation_function" { + description = "Transformation function for state_health_status detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='5m')" +} + +variable "state_health_status_max_delay" { + description = "Enforce max delay for state_health_status detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "state_health_status_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "state_health_status_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "state_health_status_disabled" { + description = "Disable all alerting rules for state_health_status detector" + type = bool + default = null +} + +variable "state_health_status_threshold_critical" { + description = "Critical threshold for state_health_status detector" + type = number + default = 0 +} + +variable "state_health_status_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "state_health_status_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# state_status detector + +variable "state_status_notifications" { + description = "Notification recipients list per severity overridden for state_status detector" + type = map(list(string)) + default = {} +} + +variable "state_status_aggregation_function" { + description = "Aggregation function and group by for state_status detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "state_status_transformation_function" { + description = "Transformation function for state_status detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='5m')" +} + +variable "state_status_max_delay" { + description = "Enforce max delay for state_status detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "state_status_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "state_status_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "state_status_disabled" { + description = "Disable all alerting rules for state_status detector" + type = bool + default = null +} + +variable "state_status_threshold_critical" { + description = "Critical threshold for state_status detector" + type = number + default = 0 +} + +variable "state_status_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "state_status_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# state_oom_killed detector + +variable "state_oom_killed_notifications" { + description = "Notification recipients list per severity overridden for state_oom_killed detector" + type = map(list(string)) + default = {} +} + +variable "state_oom_killed_aggregation_function" { + description = "Aggregation function and group by for state_oom_killed detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "state_oom_killed_transformation_function" { + description = "Transformation function for state_oom_killed detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='5m')" +} + +variable "state_oom_killed_max_delay" { + description = "Enforce max delay for state_oom_killed detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "state_oom_killed_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "state_oom_killed_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "state_oom_killed_disabled" { + description = "Disable all alerting rules for state_oom_killed detector" + type = bool + default = null +} + +variable "state_oom_killed_threshold_critical" { + description = "Critical threshold for state_oom_killed detector" + type = number + default = 0 +} + +variable "state_oom_killed_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "state_oom_killed_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +}