Flush out Pagerduty Severity functionality #61

Merged · 5 commits · May 11, 2023
6 changes: 4 additions & 2 deletions example-config.yml
@@ -10,6 +10,8 @@ listen_port: 8888
hide_logs: no
# How long to wait before alerting that a node is down.
node_down_alert_minutes: 3
+# Node Down alert Pagerduty Severity
+node_down_alert_severity: critical

# Should the prometheus exporter be enabled?
prometheus_enabled: yes
@@ -74,15 +76,15 @@ chains:
consecutive_enabled: yes
# How many missed blocks should trigger a notification?
consecutive_missed: 5
-# NOT USED: future hint for pagerduty's routing
+# Consecutive Missed alert Pagerduty Severity
consecutive_priority: critical

# For each chain there is a specific window of blocks and a percentage of missed blocks that will result in
# a downtime jail infraction. Should an alert be sent if a certain percentage of this window is exceeded?
percentage_enabled: no
# What percentage should trigger the alert
percentage_missed: 10
-# Not used yet, pagerduty routing hint
+# Percentage Missed alert Pagerduty Severity
percentage_priority: warning

# Should an alert be sent if the validator is not in the active set ie, jailed,
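For quick reference, here is a minimal sketch of the three severity knobs this PR wires up, in the style of example-config.yml. The chain name and the exact nesting under `chains:` are illustrative; PagerDuty's Events API v2 accepts exactly four severity values: critical, error, warning, info.

```yaml
# Fired when an RPC node has been down longer than node_down_alert_minutes.
node_down_alert_severity: critical

chains:
  "Example Chain":                 # illustrative chain name
    alerts:
      # Severity for the consecutive-missed-blocks alert.
      consecutive_priority: critical
      # Severity for the percentage-of-window-missed alert.
      percentage_priority: warning
```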
31 changes: 16 additions & 15 deletions td2/alert.go
@@ -5,13 +5,14 @@ import (
"context"
"encoding/json"
"fmt"
"github.com/PagerDuty/go-pagerduty"
tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5"
"log"
"net/http"
"strings"
"sync"
"time"

"github.com/PagerDuty/go-pagerduty"
tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5"
)

type alertMsg struct {
@@ -34,8 +35,8 @@ type alertMsg struct {
discHook string
discMentions string

-slkHook string
-slkMentions string
+slkHook     string
+slkMentions string
}

type notifyDest uint8
@@ -206,9 +207,9 @@ func buildSlackMessage(msg *alertMsg) *SlackMessage {
return &SlackMessage{
Text: msg.message,
Attachments: []Attachment{
-Attachment{
-Title: fmt.Sprintf("TenderDuty %s %s %s", prefix, msg.chain, msg.slkMentions),
-Color: color,
+{
+Title: fmt.Sprintf("TenderDuty %s %s %s", prefix, msg.chain, msg.slkMentions),
+Color: color,
},
},
}
@@ -481,7 +482,7 @@ func (cc *ChainConfig) watch() {
td.alert(
cc.name,
fmt.Sprintf("stalled: have not seen a new block on %s in %d minutes", cc.ChainId, cc.Alerts.Stalled),
"critical",
"info",
true,
&cc.valInfo.Valcons,
)
@@ -525,7 +526,7 @@ func (cc *ChainConfig) watch() {
td.alert(
cc.name,
fmt.Sprintf("%s has missed %d blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveMissed, cc.ChainId),
"critical",
cc.Alerts.ConsecutivePriority,
false,
&id,
)
@@ -537,7 +538,7 @@ func (cc *ChainConfig) watch() {
td.alert(
cc.name,
fmt.Sprintf("%s has missed %d blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveMissed, cc.ChainId),
"critical",
"info",
true,
&id,
)
@@ -552,7 +553,7 @@ func (cc *ChainConfig) watch() {
td.alert(
cc.name,
fmt.Sprintf("%s has missed > %d%% of the slashing window's blocks on %s", cc.valInfo.Moniker, cc.Alerts.Window, cc.ChainId),
"critical",
cc.Alerts.PercentagePriority,
false,
&id,
)
@@ -564,7 +565,7 @@ func (cc *ChainConfig) watch() {
td.alert(
cc.name,
fmt.Sprintf("%s has missed > %d%% of the slashing window's blocks on %s", cc.valInfo.Moniker, cc.Alerts.Window, cc.ChainId),
"critical",
"info",
false,
&id,
)
@@ -585,8 +586,8 @@ func (cc *ChainConfig) watch() {
nodeAlarms[node.Url] = true // used to keep active alert count correct
td.alert(
cc.name,
fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId),
"critical",
fmt.Sprintf("Severity: %s\nRPC node %s has been down for > %d minutes on %s", td.NodeDownSeverity, node.Url, td.NodeDownMin, cc.ChainId),
td.NodeDownSeverity,
false,
&node.Url,
)
@@ -596,7 +597,7 @@ func (cc *ChainConfig) watch() {
node.wasDown = false
td.alert(
cc.name,
fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId),
fmt.Sprintf("Severity: %s\nRPC node %s has been down for > %d minutes on %s", td.NodeDownSeverity, node.Url, td.NodeDownMin, cc.ChainId),
"info",
true,
&node.Url,
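Those severity strings are ultimately handed to PagerDuty. A minimal sketch of what the downstream call can look like with github.com/PagerDuty/go-pagerduty; the sendPD helper, its parameters, and the "tenderduty" source label are illustrative, not code from this PR:

```go
package main

import (
	"log"

	"github.com/PagerDuty/go-pagerduty"
)

// sendPD triggers an Events API v2 alert with the configured severity
// (e.g. cc.Alerts.ConsecutivePriority or td.NodeDownSeverity).
// Illustrative helper, not part of this PR.
func sendPD(routingKey, summary, severity, dedupKey string) error {
	_, err := pagerduty.ManageEvent(pagerduty.V2Event{
		RoutingKey: routingKey,
		Action:     "trigger", // a later "resolve" with the same DedupKey clears it
		DedupKey:   dedupKey,
		Payload: &pagerduty.V2Payload{
			Summary:  summary,
			Source:   "tenderduty", // illustrative source label
			Severity: severity,     // one of: critical, error, warning, info
		},
	})
	return err
}

func main() {
	// Illustrative values; a real Events API v2 routing key is required.
	if err := sendPD("YOUR-ROUTING-KEY", "node down", "critical", "node-down-rpc1"); err != nil {
		log.Println("pagerduty:", err)
	}
}
```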
2 changes: 2 additions & 0 deletions td2/types.go
@@ -48,6 +48,8 @@ type Config struct {
// NodeDownMin controls how long we wait before sending an alert that a node is not responding or has
// fallen behind.
NodeDownMin int `yaml:"node_down_alert_minutes"`
+// NodeDownSeverity controls the Pagerduty severity when notifying if a node is down.
+NodeDownSeverity string `yaml:"node_down_alert_severity"`

// Prom controls if the prometheus exporter is enabled.
Prom bool `yaml:"prometheus_enabled"`
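Since the new yaml keys are free-form strings, a typo like "crit" would flow through to PagerDuty unchanged. A small load-time guard, sketched below under the assumption that "critical" is a reasonable fallback; this helper is not part of the PR:

```go
package main

import "fmt"

// validSeverity returns s when it is one of the four severities accepted by
// PagerDuty's Events API v2, and fallback otherwise. Illustrative only.
func validSeverity(s, fallback string) string {
	switch s {
	case "critical", "error", "warning", "info":
		return s
	}
	return fallback
}

func main() {
	// e.g. after unmarshalling: c.NodeDownSeverity = validSeverity(c.NodeDownSeverity, "critical")
	fmt.Println(validSeverity("crit", "critical")) // prints "critical"
}
```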