From efa00d0443abc4f9cd5ab332defc0e8df638b4b5 Mon Sep 17 00:00:00 2001
From: Mitch
Date: Mon, 25 Nov 2024 20:04:57 -0500
Subject: [PATCH 1/2] add terraform for alerting on metrics

---
 spartan/metrics/terraform/grafana.tf   | 234 +++++++++++++++++++++++++
 spartan/metrics/terraform/variables.tf |  16 ++
 2 files changed, 250 insertions(+)
 create mode 100644 spartan/metrics/terraform/grafana.tf
 create mode 100644 spartan/metrics/terraform/variables.tf

diff --git a/spartan/metrics/terraform/grafana.tf b/spartan/metrics/terraform/grafana.tf
new file mode 100644
index 000000000000..930a7b274edb
--- /dev/null
+++ b/spartan/metrics/terraform/grafana.tf
@@ -0,0 +1,234 @@
+terraform {
+  required_providers {
+    grafana = {
+      source  = "grafana/grafana"
+      version = "~> 3.13.2"
+    }
+  }
+}
+
+provider "grafana" {
+  url  = var.grafana_url
+  auth = var.grafana_auth
+}
+
+# Folder that holds the alert rule group defined in this file.
+resource "grafana_folder" "rule_folder" {
+  title = "Alerting Rules"
+}
+
+
+# Slack contact point that receives alert notifications.
+resource "grafana_contact_point" "slack" {
+  name = "slack"
+
+  slack {
+    url = var.slack_url
+  }
+}
+
+# Notification policy: route alerts to Slack grouped by service_namespace,
+# and apply the "always" mute timing when service_namespace = "smoke".
+resource "grafana_notification_policy" "ignore_policy" {
+  contact_point = grafana_contact_point.slack.name
+  group_by      = ["service_namespace"]
+
+  policy {
+    contact_point = grafana_contact_point.slack.name
+
+    matcher {
+      label = "service_namespace"
+      match = "="
+      value = "smoke"
+    }
+
+    # Reference the resource instead of the literal "always" so Terraform
+    # creates the mute timing before the policy that depends on it.
+    mute_timings = [grafana_mute_timing.mute_timing_always.name]
+  }
+}
+
+# Mute timing with an empty intervals block (per the provider docs: always muted).
+resource "grafana_mute_timing" "mute_timing_always" {
+  name = "always"
+
+  intervals {
+  }
+}
+
+# Alert rules evaluated every 60 seconds (interval_seconds).
+resource "grafana_rule_group" "rule_group_minutely" {
+  org_id           = 1
+  name             = "minutely-evaluation-group"
+  folder_uid       = grafana_folder.rule_folder.uid
+  interval_seconds = 60
+
+  # Fires when the 30m increase of the proven block height is below 1
+  # and stays that way for 1 minute.
+  rule {
+    name      = "Proven Chain is Live"
+    condition = "B"
+
+    data {
+      ref_id = "A"
+
+      relative_time_range {
+        from = 600
+        to   = 0
+      }
+
+      datasource_uid = "spartan-metrics-prometheus"
+      model = jsonencode({
+        disableTextWrap     = false,
+        editorMode          = "code",
+        expr                = "increase(aztec_archiver_block_height{aztec_status=\"proven\"}[30m])",
+        fullMetaSearch      = false,
+        includeNullMetadata = true,
+        instant             = true,
+        intervalMs          = 1000,
+        legendFormat        = "__auto",
+        maxDataPoints       = 43200,
+        range               = false,
+        refId               = "A",
+        useBackend          = false
+
+      })
+    }
+    data {
+      ref_id = "B"
+
+      relative_time_range {
+        from = 600
+        to   = 0
+      }
+
+      datasource_uid = "__expr__"
+      # Keep refId in sync with ref_id ("B"), which the rule's condition points at.
+      model = jsonencode(
+        {
+          conditions = [
+            {
+              evaluator = { params = [1], type = "lt" },
+              operator  = { type = "and" },
+              query     = { params = ["B"] },
+              reducer   = { params = [], type = "last" },
+              type      = "query"
+            }
+          ],
+          datasource    = { type = "__expr__", uid = "__expr__" },
+          expression    = "A",
+          intervalMs    = 1000,
+          maxDataPoints = 43200,
+          refId         = "B",
+          type          = "threshold"
+        }
+      )
+    }
+
+    no_data_state  = "NoData"
+    exec_err_state = "Error"
+    for            = "1m"
+    annotations    = {}
+    labels         = {}
+    is_paused      = false
+  }
+
+  rule {
+    name      = "Has Peers"
+    condition = "C"
+
+    data {
+      ref_id = "A"
+
+      relative_time_range {
+        from = 600
+        to   = 0
+      }
+
+      datasource_uid = "spartan-metrics-prometheus"
+      model = jsonencode({
+        disableTextWrap     = false,
+        editorMode          = "builder",
+        expr                = "discv5_connected_peer_count",
+        fullMetaSearch      = false,
+        includeNullMetadata = true,
+        instant             = true,
+        intervalMs          = 1000,
+        legendFormat        = "__auto",
+        maxDataPoints       = 43200,
+        range               = false,
+        refId               = "A",
+        useBackend          = false
+        }
+      )
+    }
+    data {
+      ref_id = "B"
+
+      relative_time_range {
+        from = 600
+        to   = 0
+      }
+
+      datasource_uid = "__expr__"
+      model = jsonencode({
+        conditions = [
+          {
+            evaluator = { params = [], type = "gt" },
+            operator  = { type = "and" },
+            query     = { params = ["B"] },
+            reducer   = { params = [], type = "last" },
+            type      = "query"
+          }
+        ],
+        datasource    = { type = "__expr__", uid = "__expr__" },
+        expression    = "A",
+        intervalMs    = 1000,
+        maxDataPoints = 43200,
+        reducer       = "last",
+        refId         = "B",
+        type          = "reduce"
+        }
+      )
+
+    }
+    data {
+      ref_id = "C"
+
+      relative_time_range {
+        from = 600
+        to   = 0
+      }
+
+      datasource_uid = "__expr__"
+
+      model = jsonencode({
+        conditions = [
+          {
+            evaluator = { params = [1], type = "lt" },
+            operator  = { type = "and" },
+            query     = { params = ["C"] },
+            reducer   = { params = [], type = "last" },
+            type      = "query"
+          }
+        ],
+        datasource    = { type = "__expr__", uid = "__expr__" },
+        expression    = "B",
+        intervalMs    = 1000,
+        maxDataPoints = 43200,
+        refId         = "C",
+        type          = "threshold"
+        }
+      )
+    }
+
+    no_data_state  = "NoData"
+    exec_err_state = "Error"
+    for            = "5m"
+    annotations    = {}
+    labels         = {}
+    is_paused      = false
+  }
+
+
+}
diff --git a/spartan/metrics/terraform/variables.tf b/spartan/metrics/terraform/variables.tf
new file mode 100644
index 000000000000..8292a25c328c
--- /dev/null
+++ b/spartan/metrics/terraform/variables.tf
@@ -0,0 +1,16 @@
+variable "grafana_url" {
+  description = "URL of the Grafana instance the provider connects to."
+  type        = string
+}
+
+variable "grafana_auth" {
+  description = "Credentials passed to the Grafana provider's auth argument."
+  type        = string
+  sensitive   = true
+}
+
+variable "slack_url" {
+  description = "Slack webhook URL used by the Slack contact point."
+  type        = string
+  sensitive   = true
+}

From bbf116281a732dc774ea0faf23c8d0d002e94eee Mon Sep 17 00:00:00 2001
From: Mitch
Date: Mon, 25 Nov 2024 20:12:14 -0500
Subject: [PATCH 2/2] remove peer alert rule

---
 spartan/metrics/terraform/grafana.tf | 100 +--------------------------
 1 file changed, 2 insertions(+), 98 deletions(-)

diff --git a/spartan/metrics/terraform/grafana.tf b/spartan/metrics/terraform/grafana.tf
index 930a7b274edb..340d24a43700 100644
--- a/spartan/metrics/terraform/grafana.tf
+++ b/spartan/metrics/terraform/grafana.tf
@@ -1,3 +1,5 @@
+# See https://registry.terraform.io/providers/grafana/grafana/latest/docs
+
 terraform {
   required_providers {
     grafana = {
@@ -133,102 +135,4 @@ resource "grafana_rule_group" "rule_group_minutely" {
     is_paused      = false
   }
 
-  rule {
-    name      = "Has Peers"
-    condition = "C"
-
-    data {
-      ref_id = "A"
-
-      relative_time_range {
-        from = 600
-        to   = 0
-      }
-
-      datasource_uid = "spartan-metrics-prometheus"
-      model = jsonencode({
-        disableTextWrap     = false,
-        editorMode          = "builder",
-        expr                = "discv5_connected_peer_count",
-        fullMetaSearch      = false,
-        includeNullMetadata = true,
-        instant             = true,
-        intervalMs          = 1000,
-        legendFormat        = "__auto",
-        maxDataPoints       = 43200,
-        range               = false,
-        refId               = "A",
-        useBackend          = false
-        }
-      )
-    }
-    data {
-      ref_id = "B"
-
-      relative_time_range {
-        from = 600
-        to   = 0
-      }
-
-      datasource_uid = "__expr__"
-      model = jsonencode({
-        conditions = [
-          {
-            evaluator = { params = [], type = "gt" },
-            operator  = { type = "and" },
-            query     = { params = ["B"] },
-            reducer   = { params = [], type = "last" },
-            type      = "query"
-          }
-        ],
-        datasource    = { type = "__expr__", uid = "__expr__" },
-        expression    = "A",
-        intervalMs    = 1000,
-        maxDataPoints = 43200,
-        reducer       = "last",
-        refId         = "B",
-        type          = "reduce"
-        }
-      )
-
-    }
-    data {
-      ref_id = "C"
-
-      relative_time_range {
-        from = 600
-        to   = 0
-      }
-
-      datasource_uid = "__expr__"
-
-      model = jsonencode({
-        conditions = [
-          {
-            evaluator = { params = [1], type = "lt" },
-            operator  = { type = "and" },
-            query     = { params = ["C"] },
-            reducer   = { params = [], type = "last" },
-            type      = "query"
-          }
-        ],
-        datasource    = { type = "__expr__", uid = "__expr__" },
-        expression    = "B",
-        intervalMs    = 1000,
-        maxDataPoints = 43200,
-        refId         = "C",
-        type          = "threshold"
-        }
-      )
-    }
-
-    no_data_state  = "NoData"
-    exec_err_state = "Error"
-    for            = "5m"
-    annotations    = {}
-    labels         = {}
-    is_paused      = false
-  }
-
-
 }